diff options
author | tradke <tradke@38c3d835-c875-442e-b0fe-21c19ce1d001> | 1999-10-05 01:24:27 +0000 |
---|---|---|
committer | tradke <tradke@38c3d835-c875-442e-b0fe-21c19ce1d001> | 1999-10-05 01:24:27 +0000 |
commit | 3aa41187b549ff9a792d673e86efe5220848d73f (patch) | |
tree | 0bdc27f2b94a7dfd29cf4d0c25c2cd819fd833a2 /src | |
parent | 8a113f1371d777ca62b6c690e2f44bbebebd79c0 (diff) |
Added Jonghyun's IOPanda thorn.
Tested on the SGI Origin 2000 (O2K); still needs porting and testing on other architectures.
git-svn-id: http://svn.cactuscode.org/arrangements/CactusPUGHIO/IOPanda/trunk@2 38c3d835-c875-442e-b0fe-21c19ce1d001
Diffstat (limited to 'src')
60 files changed, 10165 insertions, 0 deletions
diff --git a/src/DumpVar.c b/src/DumpVar.c new file mode 100644 index 0000000..86ed6d9 --- /dev/null +++ b/src/DumpVar.c @@ -0,0 +1,202 @@ +/*@@ + @file DumpVar.c + @date 01 Oct 1999 + @author Jonghyun Lee + @desc Do the actual writing of a 3D grid function, + for output or for checkpointing + @enddesc + @history + @hendhistory + @@*/ + +#include <stdio.h> +#include <stdlib.h> +#ifdef SGI +#include <time.h> +#endif + +#include "cctk.h" +#include "cctk_Flesh.h" +#include "cctk_Groups.h" +#include "cctk_GroupsOnGH.h" +#include "cctk_Comm.h" +#include "cctk_WarnLevel.h" +#include "cctk_GHExtensions.h" +#include "cctk_parameters.h" +#ifdef CACTUSPUGH_PUGH +#include "CactusPUGH/PUGH/src/include/pugh.h" +#endif +#include "CactusBase/IOUtil/src/ioGH.h" +#include "ioPandaGH.h" + + +#define IOTAGBASE 20000 /* This may break on more than 2000 processors */ + + +static char *char_time_date = NULL; + + +void IOPanda_getDumpData (cGH *GH, int index, int timelevel, void **outme, + int *free_outme, CCTK_INT4 bnd [9], int element_size) +{ + DECLARE_CCTK_PARAMETERS + int i; + int myproc; + ioGH *ioUtilGH; + pGH *pughGH; + CCTK_REAL4 *single_ptr; + CCTK_REAL *real_ptr; + CCTK_CHAR *char_ptr; + CCTK_INT *int_ptr; + void *data = CCTK_VarDataPtrI (GH, timelevel, index); + + /* to make the compiler happy */ + single_ptr = NULL; + real_ptr = NULL; + char_ptr = NULL; + int_ptr = NULL; + + ioUtilGH = (ioGH *) GH->extensions [CCTK_GHExtensionHandle ("IO")]; + pughGH = (pGH *) GH->extensions [CCTK_GHExtensionHandle ("PUGH")]; + + myproc = CCTK_MyProc (GH); + + if (ioUtilGH->downsample_x == 1 && + ioUtilGH->downsample_y == 1 && + ioUtilGH->downsample_z == 1) { + + if (ioUtilGH->out_single) { + single_ptr = (CCTK_REAL4 *) malloc (pughGH->npoints*sizeof (CCTK_REAL4)); + + for (i = 0; i < pughGH->npoints; i++) + single_ptr [i] = (CCTK_REAL4) ((CCTK_REAL *) data) [i]; + + *outme = single_ptr; + *free_outme = 1; + } else { + *outme = data; + *free_outme = 0; + } + + for (i = 0; i < 3; i++) { + bnd 
[i] = GH->cctk_lbnd[i]; /* the bounds */ + bnd [i+3] = GH->cctk_lsh[i]; /* the sizes */ + bnd [i+6] = GH->cctk_gsh[i]; /* the global space */ + } + + } else { + + int start [3], end [3]; + int i, j, k, l; + + /* Downsampling code ... */ + bnd [6] = GH->cctk_gsh[0] / ioUtilGH->downsample_x; + if (GH->cctk_gsh[0] % ioUtilGH->downsample_x) + bnd [6]++; + bnd [7] = GH->cctk_gsh[1] / ioUtilGH->downsample_y; + if (GH->cctk_gsh[1] % ioUtilGH->downsample_y) + bnd [7]++; + bnd [8] = GH->cctk_gsh[2] / ioUtilGH->downsample_z; + if (GH->cctk_gsh[2] % ioUtilGH->downsample_z) + bnd [8]++; + + if (verbose) + printf ("Downsampled sizes (%d, %d, %d) -> (%d, %d, %d)\n", + GH->cctk_gsh[0], GH->cctk_gsh[1], GH->cctk_gsh[2], + (int) bnd [6], (int) bnd [7], (int) bnd [8]); + + /* Now figure out the local downsampling */ + /* The local starts are the lb modded into the downsample */ + for (i = 0; i < 3; i++) { + int downsample; + + if (i == 0) + downsample = ioUtilGH->downsample_x; + else if (i == 1) + downsample = ioUtilGH->downsample_y; + else + downsample = ioUtilGH->downsample_z; + + bnd [i] = GH->cctk_lbnd[i] / downsample; + start [i] = bnd [i] * downsample; + if (start [i] < + GH->cctk_lbnd[i] + pughGH->ownership [PUGH_VERTEXCTR][i][0]) { + start [i] += downsample; + bnd [i] ++; + } + end [i] = ((GH->cctk_lbnd [i] + + pughGH->ownership [PUGH_VERTEXCTR][i][1] - 1) / downsample) + * downsample; + bnd [i+3] = (end [i] - start [i]) / downsample + 1; + } + + if (verbose) { + printf ("Downsample ranges (%d, %d, %d) -> (%d, %d, %d)\n", + start [0], start [1], start [2], + end [0], end [1], end [2]); + printf ("Local size/bound (%d, %d, %d) (%d, %d, %d)\n", + (int) bnd [3], (int) bnd [4], (int) bnd [5], + (int) bnd [0], (int) bnd [1], (int) bnd [2]); + } + + /* compute local ranges */ + for (i = 0; i < 3; i++) { + start [i] -= GH->cctk_lbnd [i]; + end [i] -= GH->cctk_lbnd [i]; + } + + *outme = malloc (bnd [3] * bnd [4] * bnd [5] * element_size); + *free_outme = 1; + + /* I hate it to 
repeat the loops for each case label + but that way produces much more efficient code */ + l = 0; + switch (CCTK_VarTypeI (index)) { + case CCTK_VARIABLE_CHAR: + char_ptr = (CCTK_CHAR *) *outme; + for (k = start [2]; k <= end [2]; k += ioUtilGH->downsample_z) + for (j = start [1]; j <= end [1]; j += ioUtilGH->downsample_y) + for (i = start [0]; i <= end [0]; i += ioUtilGH->downsample_x) + char_ptr [l++] = ((CCTK_CHAR *) data) [DI (pughGH, i, j, k)]; + break; + + case CCTK_VARIABLE_INT: + int_ptr = (CCTK_INT *) *outme; + for (k = start [2]; k <= end [2]; k += ioUtilGH->downsample_z) + for (j = start [1]; j <= end [1]; j += ioUtilGH->downsample_y) + for (i = start [0]; i <= end [0]; i += ioUtilGH->downsample_x) + int_ptr [l++] = ((CCTK_INT *) data) [DI (pughGH, i, j, k)]; + break; + + case CCTK_VARIABLE_REAL: + if (ioUtilGH->out_single) + single_ptr = (CCTK_REAL4 *) *outme; + else + real_ptr = (CCTK_REAL *) *outme; + for (k = start [2]; k <= end [2]; k += ioUtilGH->downsample_z) + for (j = start [1]; j <= end [1]; j += ioUtilGH->downsample_y) + for (i = start [0]; i <= end [0]; i += ioUtilGH->downsample_x) + if (ioUtilGH->out_single) + single_ptr [l++] = (CCTK_REAL4) + (((CCTK_REAL *) data) [DI (pughGH, i, j, k)]); + else + real_ptr [l++] = ((CCTK_REAL *) data) [DI (pughGH, i, j, k)]; + break; + + default: + CCTK_WARN (1, "Unsupported variable type in IOPanda_getDumpData"); + return; + } + } + + if (verbose) { + printf ("Global size: %d %d %d\n", + (int) bnd [6], (int) bnd [7], (int) bnd [8]); + printf ("Lower bound: %d %d %d\n", + (int) bnd [0], (int) bnd [1], (int) bnd [2]); + printf ("Chunk size : %d %d %d\n", + (int) bnd [3], (int) bnd [4], (int) bnd [5]); + } +} + + diff --git a/src/GHExtension.c b/src/GHExtension.c new file mode 100644 index 0000000..210db6a --- /dev/null +++ b/src/GHExtension.c @@ -0,0 +1,90 @@ + /*@@ + @file GHExtension.c + @date 01 Oct 1999 + @author Jonghyun Lee + @desc IOPanda GH extension stuff + @enddesc + @history + @endhistory + @@*/ + 
+/*#define DEBUG_IO*/ + +#include <stdlib.h> +#include <string.h> +#include <stdio.h> + +#include "cctk_Flesh.h" +#include "cctk_Groups.h" +#include "cctk_Comm.h" +#include "cctk_Misc.h" +#include "cctk_GHExtensions.h" +#include "cctk_parameters.h" +#include "cctk_WarnLevel.h" +#ifdef CACTUSPUGH_PUGH +#include "CactusPUGH/PUGH/src/include/pugh.h" +#endif +#include "CactusBase/IOUtil/src/ioGH.h" +#include "ioPandaGH.h" + +void Panda_Create(int, int); + +void *IOPanda_SetupGH (tFleshConfig *config, int convergence_level, cGH *GH) +{ + int i, numvars; + pandaGH *newGH; + + numvars = CCTK_NumVars (); + + newGH = (pandaGH *) malloc (sizeof (pandaGH)); + newGH->IO_3Dnum = (int *) malloc (numvars * sizeof (int)); + newGH->IO_3Dlast = (int *) malloc (numvars * sizeof (int)); + + return (newGH); +} + +int IOPanda_InitGH (cGH *GH) +{ + DECLARE_CCTK_PARAMETERS + int i; + ioGH *ioUtilGH; + pandaGH *myGH; + + /* get the handles for IOUtil and IOPanda extensions */ + ioUtilGH = (ioGH *) GH->extensions [CCTK_GHExtensionHandle ("IO")]; + myGH = (pandaGH *) GH->extensions [CCTK_GHExtensionHandle ("IOPanda")]; + + /* How often to output */ + myGH->IO_3Devery = out_every; + if (out3D_every > 0) + myGH->IO_3Devery = out3D_every; + + InitIONum (myGH->IO_3Dnum, out3D_vars); + + /* Deal with the output directories */ + myGH->outpfx_3D = outdir; + if (!CCTK_Equals(outdir3D,"outdir")) + myGH->outpfx_3D = outdir3D; + + /* Create the output directories */ + if (myGH->IO_3Devery > 0) { + if (CCTK_MyProc (GH) == 0) { + FILE *fp; + + if (CCTK_mkdir (myGH->outpfx_3D) != 0) + CCTK_WARN (2,"Problem creating IO 3D directory"); + fp = fopen("FILEPREFIX", "w"); + fprintf(fp, "%s", myGH->outpfx_3D); + fclose(fp); + } + } + + for (i=0; i<CCTK_NumVars(); i++) + myGH->IO_3Dlast [i] = -1; + + myGH->fileList_3D = NULL; + + Panda_Create(ioUtilGH->ioproc_every, 1); + + return (0); +} diff --git a/src/Output3D.c b/src/Output3D.c new file mode 100644 index 0000000..41143a9 --- /dev/null +++ b/src/Output3D.c @@ 
-0,0 +1,487 @@ + /*@@ + @file Output3D.c + @date 01 Oct 1999 + @author Jonghyun Lee + @desc Functions to deal 3D output of GFs + @enddesc + @history + @hendhistory + @@*/ + +#include <stdio.h> +#include <assert.h> +#include <stdlib.h> +#include <string.h> + +#include "cctk.h" +#include "cctk_Flesh.h" +#include "cctk_Groups.h" +#include "cctk_parameters.h" +#include "cctk_GHExtensions.h" +#include "cctk_WarnLevel.h" +#include "cctk_Comm.h" +#include "ioPandaGH.h" +#include "Panda/c_interface.h" +#include "CactusBase/IOUtil/src/ioGH.h" +#ifdef CACTUSPUGH_PUGH +#include "CactusPUGH/PUGH/src/include/pugh.h" +#endif +#ifdef SGI +#include <time.h> +#endif + +#include "external/IEEEIO/src/IOProtos.h" + +int IOPanda_Output3DVarAs (cGH *GH, const char *var, const char *alias); +int IOPanda_TimeFor3D (cGH *GH, int index); +void IOPanda_Timestep (cGH *GH, int index, const char *alias); +void IOPanda_AddCommonAttributes (cGH *GH, int index, int timelevel, CCTK_INT *gsz, char *fname); +void IOPanda_IEEEIOStructDump (cGH *GH, char *fname); + +/*@@ + @routine IOPanda_Output3DGH + @date Sat March 6 1999 + @author Gabrielle Allen + @desc + Loops over all variables and outputs them if necessary + @enddesc + @calls CCTK_GHExtensionHandle + CCTK_NumVars + CCTK_ImplementationFromVar + CCTK_VarName + IOPanda_TimeFor3D + IOPanda_Output3DVarAs + @calledby + @history + + @endhistory + @var GH + @vdesc Pointer to CCTK GH + @vtype cGH + @vio in + @vcomment + @endvar +@@*/ + +int IOPanda_Output3DGH (cGH *GH) +{ + int i; + pandaGH *myGH; + char *implementation; + char *name; + char *fullname; + DECLARE_CCTK_PARAMETERS + + /* Get the GH extension for IOPanda */ + myGH = (pandaGH *) GH->extensions [CCTK_GHExtensionHandle ("IOPanda")]; + + if (myGH->IO_3Devery <= 0) + return; + + /* Loop over all variables */ + for (i = 0; i < CCTK_NumVars (); i++) { + if (IOPanda_TimeFor3D (GH, i)) { + implementation = CCTK_ImpFromVarI (i); + name = CCTK_VarName (i); + fullname = (char *) malloc (strlen 
(implementation) + + strlen (name) + 3); + assert (fullname); + sprintf (fullname, "%s::%s", implementation, name); + + if (verbose) { + printf ("IOPanda Output3DGH : \n"); + printf (" fullname/name = %s/%s\n", fullname, name); + } + + IOPanda_Output3DVarAs (GH, fullname, name); + + free (fullname); + + /* Register another 3D output for this GF */ + myGH->IO_3Dnum [i]++; + + /* Register GF as having 3D output this iteration */ + myGH->IO_3Dlast [i] = GH->cctk_iteration; + } + } + + return (0); +} + + +/*@@ + @routine IOPanda_Output3DVarAs + @date Sat March 6 1999 + @author Gabrielle Allen + @desc + unconditional output of a variable using the IOPanda 3D output method + @enddesc + @calls CCTK_DecomposeName + CCTK_VarIndex + CCTK_GHExtensionHandle + IOPanda_Write3D + @calledby IOPanda_Output3DGH + @history + + @endhistory + @var GH + @vdesc Pointer to CCTK GH + @vtype cGH + @vio in + @vcomment + @endvar + @var fullname + @vdesc complete name of variable to output + @vtype const char * + @vio in + @vcomment + @endvar + @var alias + @vdesc alias name of variable to output (used to generate output filename) + @vtype const char * + @vio in + @vcomment + @endvar +@@*/ + +int IOPanda_Output3DVarAs (cGH *GH, const char *fullname, const char *alias) +{ + DECLARE_CCTK_PARAMETERS + int index; + pandaGH *myGH; + + index = CCTK_VarIndex(fullname); + + /* Get the GH extension for IOPanda */ + myGH = (pandaGH *) GH->extensions [CCTK_GHExtensionHandle ("IOPanda")]; + + if (verbose) { + printf ("\nIn IOPanda Output3DVarAs\n-------------------\n"); + printf (" Fullname = -%s-\n", fullname); + printf (" Alias = -%s-\n", alias); + printf (" Index = %d\n", index); + } + + /* Do the 3D output */ + IOPanda_Timestep (GH, index, alias); + + return (0); +} + + +/*@@ + @routine IOPanda_TimeFor3D + @date Sat March 6 1999 + @author Gabrielle Allen + @desc + Decides if it is time to output a variable using the IOPanda 3D output + method + @enddesc + @calls CCTK_GHExtensionHandle + 
CCTK_GroupTypeFromVarI + CCTK_WARN + CCTK_QueryGroupStorageI + CCTK_GroupNameFromVarI + @calledby IOPanda_Output3DGH + @history + + @endhistory + @var GH + @vdesc Pointer to CCTK GH + @vtype cGH + @vio in + @vcomment + @endvar + @var index + @vdesc index of variable + @vtype int + @vio in + @vcomment + @endvar +@@*/ + +int IOPanda_TimeFor3D (cGH *GH, int index) +{ + pandaGH *myGH; + + /* Get the GH extension for IOPanda */ + myGH = (pandaGH *) GH->extensions [CCTK_GHExtensionHandle ("IOPanda")]; + + /* Check this GF should be output */ + if (! (myGH->IO_3Dnum [index] != 0 && + GH->cctk_iteration % myGH->IO_3Devery == 0)) + return (0); + + /* Check GF not already output this iteration */ + if (myGH->IO_3Dlast [index] == GH->cctk_iteration) { + CCTK_WARN (2, "Already done 3D output in IOPanda"); + return (0); + } + + /* Check GF has storage */ + if (! CCTK_QueryGroupStorageI (GH, + CCTK_GroupIndexFromVarI(index))) { + char *fullname = CCTK_FullName (index); + char *msg = (char *) malloc (80 + strlen (fullname)); + + sprintf (msg, "No IOPandaIO 3D output for '%s' (no storage)", fullname); + CCTK_WARN (2, msg); + free (fullname); + free (msg); + return (0); + } + + return (1); +} + + +/*@@ + @routine IOPanda_TriggerOutput3D + @date Sat March 6 1999 + @author Gabrielle Allen + @desc + Triggers the output a variable using the IOPanda 3D output + method + @enddesc + @calls CCTK_GHExtensionHandle + CCTK_VarName + IOPanda_Write3D + @calledby + @history + + @endhistory + @var GH + @vdesc Pointer to CCTK GH + @vtype cGH + @vio in + @vcomment + @endvar + @var index + @vdesc index of variable to output + @vtype int + @vio in + @vcomment + @endvar +@@*/ + +int IOPanda_TriggerOutput3D (cGH *GH, int index) +{ + DECLARE_CCTK_PARAMETERS + pandaGH *myGH; + char *varname; + + varname = CCTK_VarName (index); + + /* Get the GH extension for IOPanda */ + myGH = (pandaGH *) GH->extensions [CCTK_GHExtensionHandle ("IOPanda")]; + + if (verbose) { + printf("\nIn IOPanda 
TriggerOutput3D\n---------------------\n"); + printf(" Index = %d\n", index); + printf(" Variable = -%s-\n", varname); + } + + /* Do the 3D output */ + IOPanda_Timestep (GH, index, varname); + + /* Register another 3D output for this GF */ + myGH->IO_3Dnum [index]++; + + /* Register GF as having 3D output this iteration */ + myGH->IO_3Dlast [index] = GH->cctk_iteration; + + return (0); +} + +void IOPanda_Timestep(cGH *GH, int index, const char *alias) +{ + DECLARE_CCTK_PARAMETERS + void *data; + int tmp[1], tmp1[3], tmp2[3]; + Distribution dist1[3], dist2[3]; + CCTK_INT4 bnd[9]; + int free_flag, timelevel; + ArrayInfo ainfo; + + ioGH *ioUtilGH; + pGH *pughGH; + + if (CCTK_GroupTypeFromVarI (index) == GROUP_SCALAR) { + printf("##### %s is scalar\n", alias); + return; + } + + ioUtilGH = (ioGH *) GH->extensions [CCTK_GHExtensionHandle ("IO")]; + pughGH = (pGH *) GH->extensions [CCTK_GHExtensionHandle ("PUGH")]; + + ainfo.name_ = (char *)alias; + + ainfo.rank_ = 3; + tmp1[0] = GH->cctk_gsh[2]; + tmp1[1] = GH->cctk_gsh[1]; + tmp1[2] = GH->cctk_gsh[0]; + ainfo.size_ = tmp1; + + switch (CCTK_VarTypeI (index)) { + case CCTK_VARIABLE_CHAR: + ainfo.esize_ = CHAR; + break; + case CCTK_VARIABLE_INT: +#ifdef CCTK_INTEGER_PRECISION_8 + ainfo.esize_ = INT64; +#elif CCTK_INTEGER_PRECISION_4 + ainfo.esize_ = INT32; +#elif CCTK_INTEGER_PRECISION_2 + ainfo.esize_ = INT16; +#endif + break; + case CCTK_VARIABLE_REAL: + if (ioUtilGH->out_single) ainfo.esize_ = FLOAT32; + else { +#ifdef CCTK_REAL_PRECISION_8 + ainfo.esize_ = FLOAT64; +#elif CCTK_REAL_PRECISION_4 + ainfo.esize_ = FLOAT32; +#endif + } + } + + ainfo.mem_rank_ = 3; + tmp2[0] = pughGH->nprocz; tmp2[1] = pughGH->nprocy; tmp2[2] = pughGH->nprocx; + ainfo.mem_layout_ = tmp2; + dist1[0] = dist1[1] = dist1[2] = BLOCK; + ainfo.mem_dist_ = dist1; + + ainfo.disk_rank_ = 1; + dist2[0] = BLOCK; dist2[1] = dist2[2] = NONE; + tmp[0]= ((CCTK_nProcs(GH) - 1) / ioUtilGH->ioproc_every + 1); + + ainfo.disk_layout_ = tmp; + ainfo.disk_dist_ = 
dist2; + + timelevel = CCTK_NumTimeLevelsFromVarI (index) - 1; + if (timelevel > 0) timelevel--; + + IOPanda_getDumpData(GH, index, timelevel, &data, &free_flag, bnd, + ainfo.esize_); + ainfo.data_ = (char *)data; + ainfo.stencil_width_ = pughGH->nghostzones; + + + PandaTimestep(&ainfo); + IOPanda_AddCommonAttributes(GH, index, timelevel, ainfo.size_, ainfo.name_); + if (PandaIsNewFile(ainfo.name_)) IOPanda_IEEEIOStructDump(GH, ainfo.name_); +} + +void IOPanda_AddCommonAttributes (cGH *GH, int index, int timelevel, + CCTK_INT4 gsz [3], char *fname) +{ + DECLARE_CCTK_PARAMETERS + CCTK_REAL d3_to_IO [6]; /* buffer for writing doubles to IEEEIO */ + CCTK_INT4 i_to_IO; /* buffer for writing an int to IEEEIO */ + char *name, *gname; + ioGH *ioUtilGH; + char *char_time_date = ""; + +#ifdef SGI + time_t t = time(NULL); + char_time_date = asctime (localtime (&t)); +#endif + + /* Get the handle for IO extensions */ + ioUtilGH = (ioGH *) GH->extensions [CCTK_GHExtensionHandle ("IO")]; + + name = CCTK_FullName (index); + + Panda_WriteAttribute (fname, "name", BYTE, strlen (name) + 1, name); + + free (name); + + gname = CCTK_GroupNameFromVarI (index); + Panda_WriteAttribute (fname, "groupname", BYTE, strlen (gname) + 1, gname); + free (gname); + + i_to_IO = CCTK_GroupTypeFromVarI (index); + Panda_WriteAttribute (fname, "grouptype", INT32, + 1, &i_to_IO); + + i_to_IO = CCTK_NumTimeLevelsFromVarI (index); + Panda_WriteAttribute (fname, "ntimelevels", INT32, + 1, &i_to_IO); + + i_to_IO = timelevel; + Panda_WriteAttribute (fname, "timelevel", INT32, + 1, &i_to_IO); + + if (char_time_date && out3D_datestamp) + Panda_WriteAttribute (fname, "date", BYTE, + strlen (char_time_date) + 1, char_time_date); + + Panda_WriteAttribute (fname, "time", FLOAT64, 1,&GH->cctk_time); + + d3_to_IO [0] = CCTK_CoordOrigin ("x"); + d3_to_IO [1] = CCTK_CoordOrigin ("y"); + d3_to_IO [2] = CCTK_CoordOrigin ("z"); + Panda_WriteAttribute (fname, "origin", FLOAT64,3,d3_to_IO); + CCTK_CoordRange (GH, 
&d3_to_IO [0], &d3_to_IO [3], "x"); + CCTK_CoordRange (GH, &d3_to_IO [1], &d3_to_IO [4], "y"); + CCTK_CoordRange (GH, &d3_to_IO [2], &d3_to_IO [5], "z"); + Panda_WriteAttribute (fname, "min_ext",FLOAT64,3,d3_to_IO); + Panda_WriteAttribute (fname, "max_ext",FLOAT64, 3,d3_to_IO+3); + + d3_to_IO [0] = GH->cctk_delta_space [0] * ioUtilGH->downsample_x; + d3_to_IO [1] = GH->cctk_delta_space [1] * ioUtilGH->downsample_y; + d3_to_IO [2] = GH->cctk_delta_space [2] * ioUtilGH->downsample_z; + Panda_WriteAttribute (fname, "delta", FLOAT64, 3,d3_to_IO); + + if (ioUtilGH->downsample_x > 1 || + ioUtilGH->downsample_y > 1 || + ioUtilGH->downsample_z > 1) { + d3_to_IO [0] = GH->cctk_delta_space [0]; + d3_to_IO [1] = GH->cctk_delta_space [1]; + d3_to_IO [2] = GH->cctk_delta_space [2]; + Panda_WriteAttribute (fname, "evolution_delta", FLOAT64, 3, d3_to_IO); + } + + Panda_WriteAttribute (fname, "global_size", INT32, 3, gsz); + + i_to_IO = CCTK_nProcs (GH); + Panda_WriteAttribute (fname, "nprocs", INT32, 1, &i_to_IO); + + i_to_IO = ioUtilGH->ioproc_every; + Panda_WriteAttribute (fname, "ioproc_every", INT32, 1, &i_to_IO); + + i_to_IO = ioUtilGH->unchunked; + Panda_WriteAttribute (fname, "unchunked", INT32, 1, &i_to_IO); + + i_to_IO = GH->cctk_iteration; + Panda_WriteAttribute (fname, "iteration", INT32, 1, &i_to_IO); +} + + +void IOPanda_IEEEIOStructDump (cGH *GH, char *fname) +{ + + CCTK_INT4 i_temp; + CCTK_REAL d_temp; + ioGH *ioUtilGH; + + + ioUtilGH = (ioGH *) GH->extensions [CCTK_GHExtensionHandle ("IO")]; + + i_temp = GH->cctk_iteration; + Panda_WriteAttribute (fname, "GH$iteration", INT32, + 1, &i_temp); + + i_temp = ioUtilGH->ioproc_every; + Panda_WriteAttribute (fname, "GH$ioproc_every", INT32, + 1, &i_temp); + + i_temp = CCTK_nProcs (GH); + Panda_WriteAttribute (fname, "GH$nprocs", INT32, + 1, &i_temp); + + d_temp = GH->cctk_time; + Panda_WriteAttribute (fname, "GH$time", FLOAT64, + 1, &d_temp); +} diff --git a/src/Panda/App_Info.C b/src/Panda/App_Info.C new file mode 
100644 index 0000000..77f1d4b --- /dev/null +++ b/src/Panda/App_Info.C @@ -0,0 +1,96 @@ +#include "definitions.h" +#include "App_Info.h" + +App_Info::App_Info(int app_num, int app_size, int *world_ranks) +{ + int world_size; + + app_num_ = app_num; + app_size_ = app_size; + world_ranks_ = copy_int_list(app_size, world_ranks); + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + relative_ranks_ = (int *) malloc(sizeof(int)*world_size); + for(int i=0; i < world_size; i++) + relative_ranks_[i] = -1; + for(i=0; i < app_size_; i++) + relative_ranks_[world_ranks_[i]] = i; + intra_comm_ = NULL; + combine_count_ = 0; + +#ifdef DEBUG + printf("Creating an new App Info object\n"); + printf("App_num = %d App_size = %d\n", app_num_, app_size_); + printf("Ranks relative: world: world-relative\n"); + for(int j=0;j<app_size_;j++) + printf(" %d %d %d\n", j, world_ranks_[j], + relative_ranks_[world_ranks_[j]]); +#endif +} + + +App_Info::~App_Info() +{ + if (world_ranks_ != NULL) free(world_ranks_); + if (relative_ranks_ != NULL) free(relative_ranks_); + if (intra_comm_ != NULL) + { + MPI_Comm_free(intra_comm_); + free(intra_comm_); + intra_comm_ =NULL; + } + world_ranks_ = NULL; + relative_ranks_ = NULL; +} + + +int App_Info::app_num(){ return app_num_;} + +int App_Info::app_size(){ return app_size_;} + +int App_Info::get_master(){ return world_ranks_[0];} + +int App_Info::world_rank(int relative_rank) +{ + return world_ranks_[relative_rank]; +} + +int App_Info::relative_rank(int world_rank) +{ + return relative_ranks_[world_rank]; +} + +void App_Info::set_intra_comm(MPI_Comm *intra_comm) +{ + intra_comm_ = intra_comm; +} + +MPI_Comm* App_Info::intra_comm() +{ + return intra_comm_; +} + +void App_Info::inc_combine_count() +{ + combine_count_++; +} + +int App_Info::combine_count() +{ + return combine_count_; +} + +void App_Info::reset_combine_count() +{ + combine_count_ = 0; +} + + +int* App_Info::world_ranks(){ + return world_ranks_; +} + +void App_Info::world_ranks(int *ret_list) +{ + 
for(int i=0; i < app_size_; i++) + ret_list[i] = world_ranks_[i]; +} diff --git a/src/Panda/App_Info.h b/src/Panda/App_Info.h new file mode 100644 index 0000000..f5d9664 --- /dev/null +++ b/src/Panda/App_Info.h @@ -0,0 +1,31 @@ +#ifndef App_Info_dot_h +#define App_Info_dot_h + +#include "mpi.h" + +class App_Info { + int app_num_; + int app_size_; + int *world_ranks_; + int *relative_ranks_; + MPI_Comm *intra_comm_; + int combine_count_; + + public: + App_Info(int,int,int*); + virtual ~App_Info(); + int app_num(); + int app_size(); + int get_master(); + int world_rank(int); + int relative_rank(int); + void set_intra_comm(MPI_Comm *); + MPI_Comm* intra_comm(); + void inc_combine_count(); + int combine_count(); + void reset_combine_count(); + int *world_ranks(); + void world_ranks(int*); +}; + +#endif diff --git a/src/Panda/Array.C b/src/Panda/Array.C new file mode 100644 index 0000000..e2fd7eb --- /dev/null +++ b/src/Panda/Array.C @@ -0,0 +1,649 @@ +#include "definitions.h" +#include "MPIFS.h" +#include "Array.h" + +#include "external/IEEEIO/src/Arch.h" + +extern "C" { + int IOsizeOf(int); + int IOreadAttributeInfo(IOFile, char *,int *, int *); + int IOreadAttribute(IOFile,int,void*); +} + +extern int global_system_type_; +extern MPIFS* MPIFS_global_obj; +extern int SUBCHUNK_SIZE; + +/*************************************************************************** + * Class: Array + * Description: This is a user-visible class. This is used to describe the + * global array. It also stores pointers to local chunks of + * data. 
+ * + * Instance-variables: + * name_ - name of the array + * rank_ - rank of the array (inherited variable) + * size_ - size of the array (elements) in the various dimensions + * element_size_ - size of each array element (in bytes) + * compute_node_layout_ - layout of the compute nodes + * io_node_layout_ - layout of the io nodes + * subchunk_layout_ - layout of the subchunks + * natural_chunked - whether the array is natural chunked + * compute_node_alloc_policy - chunk dist policy on compute nodes + * io_node_alloc_policy - chunk dist policy on the io nodes + **************************************************************************** + */ + +Array::Array() : Template() +{ + subchunk_layout_ = NULL; + element_size_ = 0; + natural_chunked_ = NO; + sub_chunked_ = NO; + overlap_ = NO; + io_strategy_ = SIMPLE_IO; +} + +/* This function is used on the compute nodes to create the array * + * object. In this case there is no user-specified subchunking and * + * the chunk distribution on the compute nodes is 1 chunk per * + * compute node and round-robin on the io nodes. */ +Array::Array(char *name, int rank, int *sizearray, int elt_size, + ArrayLayout *mem_layout, Distribution *mem_dist, + ArrayLayout *io_layout, Distribution *io_dist):Template(rank, sizearray) +{ + do_init(name, rank, sizearray, elt_size, mem_layout, mem_dist, + io_layout, io_dist, NULL, NULL, REGULAR, ROUND_ROBIN, HPF); + /* call function to allocate chunk_list */ + if (sizearray) allocate_chunks(COMPUTE_NODE); + overlap_ = NO; +} + +/* This function is used on the compute nodes to create the array * + * object. In this case there is no user-specified subchunking and * + * the chunk distribution on the compute nodes is 1 chunk per * + * compute node and round-robin on the io nodes. Also in this case* + * the user specifies the data ptr to be used. 
*/ +Array::Array(char *name, int rank, int *sizearray, int elt_size, + ArrayLayout *mem_layout, Distribution *mem_dist, + ArrayLayout *io_layout, Distribution *io_dist, + char *data_ptr) : Template(rank, sizearray) +{ + char *ptr = data_ptr; + do_init(name, rank, sizearray, elt_size, mem_layout, mem_dist, + io_layout, io_dist, NULL, NULL, REGULAR, ROUND_ROBIN, HPF); + /* call function to allocate chunk_list */ + if (sizearray) allocate_chunks(COMPUTE_NODE, 1 , &ptr, 0); + overlap_ = NO; +} + +/* This function is used on the compute nodes to create the array * + * object. In this case there is no user-specified subchunking and * + * the chunk distribution on the compute nodes is 1 chunk per * + * compute node and round-robin on the io nodes. Also in this case* + * the user specifies the data ptr to be used and stencil width. */ +Array::Array(char *name, int rank, int *sizearray, int elt_size, + ArrayLayout *mem_layout, Distribution *mem_dist, + ArrayLayout *io_layout, Distribution *io_dist, + char *data_ptr, int stencil_width) : Template(rank, sizearray) +{ + char *ptr = data_ptr; + do_init(name, rank, sizearray, elt_size, mem_layout, mem_dist, + io_layout, io_dist, NULL, NULL, REGULAR, ROUND_ROBIN, HPF); + /* call function to allocate chunk_list */ + if (sizearray) allocate_chunks(COMPUTE_NODE, 1 , &ptr, stencil_width); + if (stencil_width > 0) overlap_ = YES; + else overlap_ = NO; +} + +/* This function is used on the compute nodes to create the array * + * object. In this case there is user-specified subchunking and * + * the chunk distribution on the compute nodes is 1 chunk per * + * compute node and round-robin on the io nodes. 
*/ +Array::Array(char *name, int rank, int *sizearray, int elt_size, + ArrayLayout *mem_layout, Distribution *mem_dist, + ArrayLayout *io_layout, Distribution *io_dist, + ArrayLayout *sub_layout, Distribution* sub_dist) + : Template(rank, sizearray) +{ + do_init(name, rank, sizearray, elt_size, mem_layout, mem_dist, + io_layout, io_dist, sub_layout, sub_dist, + REGULAR, ROUND_ROBIN, HPF); + /* call function to allocate chunk_list */ + if (sizearray) allocate_chunks(COMPUTE_NODE); + overlap_ = NO; +} + +/* This function is used on the compute nodes to create the array * + * object. In this case there is user-specified subchunking and * + * the chunk distribution on the compute nodes is 1 chunk per * + * compute node and round-robin on the io nodes. This function is * + * used to when the user provides the data_ptr. */ +Array::Array(char *name, int rank, int *sizearray, int elt_size, + ArrayLayout *mem_layout, Distribution *mem_dist, + ArrayLayout *io_layout, Distribution *io_dist, + ArrayLayout *sub_layout, Distribution* sub_dist, + char *data_ptr) : Template(rank, sizearray) +{ + char *ptr = data_ptr; + do_init(name, rank, sizearray, elt_size, mem_layout, mem_dist, + io_layout, io_dist, sub_layout, sub_dist, + REGULAR, ROUND_ROBIN, HPF); + /* call function to allocate chunk_list */ + if (sizearray) allocate_chunks(COMPUTE_NODE, 1, &ptr, 0); + overlap_ = NO; +} + +/* Initializes the state of the array object. 
the chunks are allocated * + * via another function */ +void Array::do_init(char *name, int rank, int *sizearray, int elt_size, + ArrayLayout *mem_layout, Distribution *mem_dist, + ArrayLayout *io_layout, Distribution *io_dist, + ArrayLayout *subchunk_layout, Distribution *subchunk_dist, + ChunkAllocPolicy comp_node_policy, ChunkAllocPolicy io_node_policy, + Block_Distribution block_dist) +{ + io_strategy_ = SIMPLE_IO; + + name_ = (char *) malloc(sizeof(char)*(strlen(name)+5)); + strcpy(name_, name); + ieee_size_ = elt_size; + element_size_ = IOsizeOf(ieee_size_); + + compute_node_layout_ = new RegularDistribution(rank, mem_layout, mem_dist, + comp_node_policy, block_dist); + if (io_layout) + io_node_layout_ = new RegularDistribution(rank, io_layout, io_dist, + io_node_policy, block_dist); + else io_node_layout_ = NULL; + if (subchunk_layout) + subchunk_layout_ = new RegularDistribution(rank, subchunk_layout, + subchunk_dist, ROUND_ROBIN, + block_dist); + else subchunk_layout_ = NULL; + + /* Check if there is any sub-chunking */ + if (subchunk_layout_) sub_chunked_ = YES; + else sub_chunked_ = NO; + + /* Check if there is any natural chuunking */ + if (compute_node_layout_->equal(io_node_layout_)) natural_chunked_ = YES; + else natural_chunked_ = NO; +} + +/* This function is used to initialize the array objects on the io * + * node side. 
*/ +Array::Array(int **schema_buf) +{ + int* ptr = *schema_buf; + + io_strategy_ = *ptr++; + op_type_ = *ptr++; + int len = *ptr++; + name_ = (char *) malloc(len+1); + for (int i=0; i< len; i++) name_[i] = (char) *ptr++; + name_[len] = '\0'; + rank_ = *ptr++; + + if (*ptr++ > 0) { + size_ = (int *) malloc(sizeof(int) * rank_); + for(int i=0; i < rank_; i++) size_[i] = *ptr++; + } else size_ = NULL; + + element_size_ = *ptr++; + ieee_size_ = *ptr++; + natural_chunked_ = (Boolean) *ptr++; + sub_chunked_ = (Boolean) *ptr++; + overlap_ = (Boolean) *ptr++; + + compute_node_layout_ = unpack_layout(&ptr); + io_node_layout_ = unpack_layout(&ptr); + + if (sub_chunked_) subchunk_layout_ = unpack_layout(&ptr); + else subchunk_layout_ = NULL; + + *schema_buf = ptr; +} + +ArrayDistribution *Array::unpack_layout(int **schema_buf) +{ + int *ptr = *schema_buf; + int type = *ptr++; + ArrayDistribution *tmp; + + if (type == UNSET) tmp = NULL; + else if (type == Regular) tmp = new RegularDistribution(&ptr); + else if (type == Irregular) {printf("Irregular is not supported\n"); exit(0);} + else tmp = NULL; + + *schema_buf = ptr; + return tmp; +} + +/* Allocate chunks - Currently only used on the compute node side */ +void Array::allocate_chunks(int node_type) +{ + int my_rank; + Chunk *new_chunk; + + if (node_type == COMPUTE_NODE) { + /* First find out what kind of system we have (MPI or sequential) */ + if (global_system_type_ == MPI_SYSTEM) { + /* Allocate a single chunk with index=compute_node_rank */ + my_rank = MPIFS_global_obj->my_rank(COMPUTE_NODE); + new_chunk = new Chunk(this, my_rank, COMPUTE_NODE, ALLOC); + compute_node_layout_->add_last(new_chunk); + } else if (global_system_type_ == UNIX_SYSTEM) { + /* There is only one kind of Allocation policy */ + int num = compute_node_layout_->total_elements(); + for (my_rank=0; my_rank<num; my_rank++) { + new_chunk = new Chunk(this, my_rank, COMPUTE_NODE, ALLOC); + compute_node_layout_->add_last(new_chunk); + } + } else 
printf("Unsupported filesystem\n"); + } else if (node_type == IO_NODE) { + printf("Will have to do this later\n"); + } else { + printf("Error: Don't know the node type\n"); + } +} + +/* Allocate chunks with user-specified data pointer. This function + * currently supports only the REGULAR distribution of chunks in + * the MPI-based file system and + * should be called only on the compute node side + */ +void Array::allocate_chunks(int node_type, int num_ptrs, + char **data_ptr, int stencil_width) +{ + int my_rank; + Chunk *new_chunk; + + if (node_type == COMPUTE_NODE) { + /* First find out what kind of system we have (MPI or sequential) */ + if (global_system_type_ == MPI_SYSTEM) { + /* Allocate a single chunk with index=compute_node_rank */ + my_rank = MPIFS_global_obj->my_rank(COMPUTE_NODE); + new_chunk = new Chunk(this, my_rank, COMPUTE_NODE, NO_ALLOC); + new_chunk->set_data_ptr(data_ptr[0]); + new_chunk->set_stencil_width(stencil_width); + compute_node_layout_->add_last(new_chunk); + } else if (global_system_type_ == UNIX_SYSTEM) { + /* There is only one kind of Allocation policy */ + int num = compute_node_layout_->total_elements(); + for (my_rank=0; my_rank<num; my_rank++) { + new_chunk = new Chunk(this, my_rank, COMPUTE_NODE, NO_ALLOC); + new_chunk->set_data_ptr(data_ptr[my_rank]); + new_chunk->set_stencil_width(stencil_width); + compute_node_layout_->add_last(new_chunk); + } + } else printf("Unsupported filesystem\n"); + } else if (node_type == IO_NODE) { + printf("Will have to do this later\n"); + } else { + printf("Error: Don't know the node type\n"); + } +} + +Array::~Array() +{ + if (name_) free(name_); + name_ = NULL; + if (compute_node_layout_) delete(compute_node_layout_); + if (io_node_layout_) delete(io_node_layout_); + if (subchunk_layout_) delete(subchunk_layout_); + compute_node_layout_ = io_node_layout_ = subchunk_layout_ = NULL; +} + +/* We are not packing the chunk information here */ +void Array::pack(int** schema_buf, int *schema_size) +{ + 
int *ptr, *head; + int i, len; + + ptr = (int *) malloc(sizeof(int)*100); + head = ptr; + + *ptr++ = io_strategy_; + *ptr++ = op_type_; + len = strlen(name_); + *ptr++ = len; + for(i=0; i<len;i++) *ptr++ = (int) name_[i]; + *ptr++ = rank_; + if (size_) { *ptr++ = 1; for(int i=0; i < rank_;i++) *ptr++ = size_[i]; } + else *ptr++ = 0; + *ptr++ = element_size_; + *ptr++ = ieee_size_; + *ptr++ = (int)natural_chunked_; + *ptr++ = (int)sub_chunked_; + *ptr++ = (int)overlap_; + + if (compute_node_layout_) compute_node_layout_->pack(&ptr); + else *ptr++ = (int)UNSET; + if (io_node_layout_) io_node_layout_->pack(&ptr); + else *ptr++ = (int)UNSET; + if (sub_chunked_) subchunk_layout_->pack(&ptr); + + *schema_size = (int)(ptr - head); + *schema_buf = head; +} + +ArrayDistribution* Array::layout(int layout_type) +{ + switch(layout_type) { + case COMPUTE_NODE: + return compute_node_layout_; + case IO_NODE: + return io_node_layout_; + case SUB_CHUNK: + return subchunk_layout_; + default: + printf("Invalid type\n"); + return NULL; + } +} + +/* The following two functions are used for regular layouts (HPF-style) only */ +/* Given a chunk index and node type, this function returns the * + * the relative node number on which the chunk resides */ +int Array::which_node(int chunk_id, int node_type) +{ + if (node_type == COMPUTE_NODE) + if (compute_node_layout_->alloc_policy() == REGULAR) return chunk_id; + else { + printf("Unsupported chunk alloc type\n"); + exit(1); + } + else if (node_type == IO_NODE) { + printf("Currently this is unsupported\n"); + exit(1); + } else { + printf("Unsupported node type\n"); + exit(1); + } + return -1; +} + +int Array::which_node(int chunk_id, int node_type, int num_io_nodes) +{ + if (node_type == IO_NODE){ + switch(io_node_layout_->alloc_policy()){ + case ROUND_ROBIN: + return(chunk_id % num_io_nodes); + + default: + printf("Error in which_node(int,int,int).. 
Invalid distribution type\n"); + exit(1); + } + } else if (node_type == COMPUTE_NODE) { + switch(compute_node_layout_->alloc_policy()){ + case REGULAR: + return chunk_id; + + default: + printf("Error in which_node(int,int,int)... Invalid distribution type\n"); + exit(1); + } + } else { + printf("Error in which_node(int,int,int)... Invalid node type\n"); + exit(1); + } + return -1; +} + +Chunk* Array::get_next_chunk() +{ + return compute_node_layout_->get_next_chunk(); +} + +/* The following seven functions are called by compute nodes only */ +/* Given a chunk index, find the chunk */ +Chunk* Array::find_chunk(int id) +{ + List *list = compute_node_layout_->chunk_list(); + Cell *list_ptr = list != NULL ? list->head_: NULL; + Chunk *chunk_ptr; + + while (list_ptr) { + chunk_ptr = (Chunk *)list_ptr->item(); + if (chunk_ptr->chunk_id() == id) return chunk_ptr; + list_ptr = list_ptr->next(); + } + return NULL; +} + +int Array::element_size(){return element_size_;} +int Array::ieee_size(){return ieee_size_;} + +Boolean Array::nat_chunked(){return natural_chunked_;} + +Boolean Array::sub_chunked(){return sub_chunked_;} + +/* This function needs to be checked and refined */ +void Array::make_sub_chunks(Chunk *chunk) +{ + Distribution *subchunk_dist; + int *subchunk_layout_sizes; + int i, tmp_size, dim, val_dim; + int *chunk_size = chunk->size(); + + if (sub_chunked_) { + printf("Error: Array already subchunked\n"); + exit(1); + } else { + subchunk_dist = (Distribution *) malloc(sizeof(Distribution)*rank_); + subchunk_layout_sizes = (int*) malloc(sizeof(int)*rank_); + tmp_size = chunk->total_size_in_bytes(); + if (tmp_size < SUBCHUNK_SIZE){ + for(i=0;i<rank_;i++){ + subchunk_dist[i] = BLOCK; + subchunk_layout_sizes[i] = 1; + } + } else { + tmp_size = element_size_; + i = rank_; + while(tmp_size < SUBCHUNK_SIZE){ + i--; + tmp_size *= chunk_size[i]; + } + dim =i; + tmp_size /=chunk_size[i]; + val_dim = SUBCHUNK_SIZE / tmp_size; + for(i=0;i<dim;i++){ + subchunk_dist[i] = 
BLOCK; + subchunk_layout_sizes[i] = chunk_size[i]; + } + subchunk_dist[dim] = BLOCK; + subchunk_layout_sizes[dim] = (chunk_size[i] + val_dim -1)/val_dim; + for(i=dim+1;i<rank_; i++){ + subchunk_dist[i] = BLOCK; + subchunk_layout_sizes[i] = 1; + } + } + ArrayLayout *tmp_layout = new ArrayLayout(rank_, subchunk_layout_sizes); + subchunk_layout_ = new RegularDistribution(rank_, tmp_layout, + subchunk_dist, ROUND_ROBIN, HPF); + sub_chunked_ = YES; + free(subchunk_layout_sizes); + free(subchunk_dist); + } +} + +int Array::array_info() +{ + List *list = compute_node_layout_->chunk_list(); + Cell *list_ptr = list->head_; + Chunk *chunk_ptr; + int ret =0; + + while(list_ptr) { + chunk_ptr = (Chunk *)list_ptr->item(); + ret += chunk_ptr->total_size_in_bytes(); + list_ptr = list_ptr->next(); + } + return ret; +} + +/* Called only on the I/O node side */ +int Array::get_next_index(Chunk *&chunk, int old_val, int io_node_num, + int num_io_nodes, int max) +{ + int ret = io_node_layout_->get_next_index(chunk, old_val, io_node_num, + num_io_nodes, max); + if (io_node_layout_->distribution_type() == Regular) + if (ret < max) chunk->init(this, ret, IO_NODE, NO_ALLOC); + return ret; +} + +/* This function should be called only on the compute node side and * + * make sense only for the regular distribution of chunks, */ +void Array::set_data_ptr(char *data_ptr) +{ + List *list = compute_node_layout_->chunk_list(); + Chunk *chunk_ptr; + + if (list && list->head_){ + chunk_ptr = (Chunk *) list->head_->item(); + chunk_ptr->set_data_ptr(data_ptr); + } else { + printf("Error: No chunks present - cannot set data ptr\n"); + } +} + + +/* This function should be called only on the compute node side and * + * make sense only for the regular distribution of chunks, */ +char* Array::get_data_ptr() +{ + List *list = compute_node_layout_->chunk_list(); + Chunk *chunk_ptr; + + if (list && list->head_){ + chunk_ptr = (Chunk *) list->head_->item(); + return ((char *)chunk_ptr->data_ptr()); + } else { 
+ printf("Error: No chunks present - cannot set data ptr\n"); + return NULL; + } +} + +Boolean Array::overlaped() +{ + return overlap_; +} + +void Array::read_schema_file(IOFile file_ptr) +{ + int *base = (int *)malloc(sizeof(int) * rank_); + int *size = (int *)malloc(sizeof(int) * rank_); + int index, length, datatype; + Chunk *new_chunk; + + index = IOreadAttributeInfo(file_ptr, "chunk_origin", &datatype, &length); + if (index >=0 ) { // the attribute exists + IOreadAttribute(file_ptr, index, base); + index = IOreadAttributeInfo(file_ptr, "chunk_size",&datatype,&length); + if (index < 0) { printf("Error in reading attributes\n"); exit(0); } + IOreadAttribute(file_ptr, index, size); + new_chunk = new Chunk(this, base, size); + } else { + for (int j=0; j<rank_; j++) base[j] = 0; + new_chunk = new Chunk(this, base, size_); + } + io_node_layout_ = new IrregularDistribution(1, &new_chunk); + free(base); + free(size); +} + +/* The collective io operation to write out the arrays. */ +void Array::timestep() +{ + int *schema, schema_size; + int node_type = MPIFS_global_obj->node_type(); + + op_type_ = TIMESTEP; + if (node_type == COMPUTE_NODE){ + MPIFS_global_obj->send_array_schema(this); + MPIFS_global_obj->compute_node_io_loop(this); + } + else if (node_type == PART_TIME_COMPUTE) + MPIFS_global_obj->compute_node_io_loop(this); + else { + pack(&schema, &schema_size); + MPIFS_global_obj->part_time_io_node_loop(schema, schema_size, this); + } +} + +/* The collective io operation to write out the arrays. 
*/ +void Array::checkpoint() +{ + int *schema, schema_size; + int node_type = MPIFS_global_obj->node_type(); + + op_type_ = CHECKPOINT; + if (node_type == COMPUTE_NODE){ + MPIFS_global_obj->send_array_schema(this); + MPIFS_global_obj->compute_node_io_loop(this); + } + else if (node_type == PART_TIME_COMPUTE) + MPIFS_global_obj->compute_node_io_loop(this); + else { + pack(&schema, &schema_size); + MPIFS_global_obj->part_time_io_node_loop(schema, schema_size, this); + } +} + +/* The collective io operation to read in the arrays from a * + * checkpoint file. Currently (for testing purposes) this * + * does not happen. */ +void Array::restart() +{ + int *schema, schema_size; + int node_type = MPIFS_global_obj->node_type(); + + op_type_ = RESTART; + if (node_type == COMPUTE_NODE){ + MPIFS_global_obj->send_array_schema(this); + MPIFS_global_obj->compute_node_io_loop(this); + } + else if (node_type == PART_TIME_COMPUTE) + MPIFS_global_obj->compute_node_io_loop(this); + else { + pack(&schema, &schema_size); + MPIFS_global_obj->part_time_io_node_loop(schema, schema_size , this); + } +} + +void Array::read_timestep() +{ + int *schema, schema_size; + int node_type = MPIFS_global_obj->node_type(); + + op_type_ = READ_TIMESTEP; + if (node_type == COMPUTE_NODE){ + MPIFS_global_obj->send_array_schema(this); + MPIFS_global_obj->compute_node_io_loop(this); + } + else if (node_type == PART_TIME_COMPUTE) + MPIFS_global_obj->compute_node_io_loop(this); + else { + pack(&schema, &schema_size); + MPIFS_global_obj->part_time_io_node_loop(schema, schema_size , this); + } +} + +int Array::op_type() { return op_type_; } +int Array::io_strategy() { return io_strategy_; } + +void Array::init(int rank, int ieee_size, int *size, int node_type) +{ + rank_ = rank; + ieee_size_ = ieee_size; + element_size_ = IOsizeOf(ieee_size_); + size_ = size; + if (node_type == COMPUTE_NODE) { + int my_rank = MPIFS_global_obj->my_rank(COMPUTE_NODE); + Chunk *new_chunk = new Chunk(this, my_rank, COMPUTE_NODE, 
ALLOC); + compute_node_layout_->add_last(new_chunk); + } +} diff --git a/src/Panda/Array.h b/src/Panda/Array.h new file mode 100644 index 0000000..834fd36 --- /dev/null +++ b/src/Panda/Array.h @@ -0,0 +1,88 @@ +#ifndef Array_dot_h +#define Array_dot_h + +#include "List.h" +#include "ArrayDistribution.h" +#include "Chunk.h" + +#include "external/IEEEIO/src/Arch.h" + +//#include "../IEEEIO/IEEEIO.h" +//#include "../IEEEIO/IOProtos.h" + + +class Array : public Template, public Linkable { + protected: + ArrayDistribution *compute_node_layout_; + ArrayDistribution *io_node_layout_; + ArrayDistribution *subchunk_layout_; + int element_size_; + int ieee_size_; + char *name_; + Boolean natural_chunked_; + Boolean sub_chunked_; + Boolean overlap_; + int op_type_; + int io_strategy_; + + void do_init(char*, int, int*, int, + ArrayLayout*, Distribution*, + ArrayLayout*, Distribution*, + ArrayLayout*, Distribution*, + ChunkAllocPolicy, ChunkAllocPolicy, + Block_Distribution); + void allocate_chunks(int); + void allocate_chunks(int,int,char**,int); + ArrayDistribution *unpack_layout(int **); + + public: + Array(char*,int, int*, int, ArrayLayout*, + Distribution*, ArrayLayout*, + Distribution*); + Array(char*,int, int*, int, ArrayLayout*, + Distribution*, ArrayLayout*, + Distribution*, char *); + Array(char*,int, int*, int, ArrayLayout*, + Distribution*, ArrayLayout*, + Distribution*, ArrayLayout*, + Distribution*); + Array(char*,int, int*, int, ArrayLayout*, + Distribution*, ArrayLayout*, + Distribution*, ArrayLayout*, + Distribution*, char *); + Array(char*,int, int*, int, + ArrayLayout*, Distribution*, + ArrayLayout*, Distribution*, char *, int); + Array(int **); + Array(); + virtual ~Array(); + void init(int,int,int*,int); + Chunk* get_next_chunk(); + int which_node(int,int,int); + void delete_chunks(); + void pack(int**, int*); + ArrayDistribution* layout(int); + int which_node(int,int); + Chunk* find_chunk(int); + int element_size(); + int ieee_size(); + Boolean 
nat_chunked(); + Boolean sub_chunked(); + void make_sub_chunks(Chunk*); + int array_info(); + int get_next_index(Chunk*&,int,int,int,int); + int num_of_chunks(); + void set_data_ptr(char *); + char* get_data_ptr(); + Boolean overlaped(); + void read_schema_file(IOFile); + + void timestep(); + void read_timestep(); + void checkpoint(); + void restart(); + int op_type(); + int io_strategy(); +}; + +#endif diff --git a/src/Panda/ArrayDistribution.C b/src/Panda/ArrayDistribution.C new file mode 100644 index 0000000..04e5226 --- /dev/null +++ b/src/Panda/ArrayDistribution.C @@ -0,0 +1,205 @@ +#include "ArrayDistribution.h" + +/******************************** + * ArrayDistribution * + ********************************/ +Boolean ArrayDistribution::equal(ArrayDistribution *) { return NO; } + +int ArrayDistribution::distribution_type() +{ + printf("In ArrayDistributon: distribution_type, shouldn't be called\n"); + return -1; +} + +ArrayDistribution::ArrayDistribution() +{ + num_of_chunks_ = 0; + chunk_list_ = new List(); + current_cell_ = NULL; +} + +ArrayDistribution::ArrayDistribution(int **schema_buf) +{ + printf("In ArrayDistributon: init, shouldn't be called\n"); +} + +void ArrayDistribution::add_last(Chunk *new_chunk) +{ + chunk_list_->add_last(new_chunk); + num_of_chunks_++; +} + +ArrayDistribution::~ArrayDistribution() +{ + Cell *list_ptr; + if (chunk_list_) { + list_ptr = chunk_list_->head_; + while (list_ptr) { + delete list_ptr->item(); + list_ptr = list_ptr->next(); + } + delete chunk_list_; + chunk_list_ = NULL; num_of_chunks_ = 0; + } +} + +void ArrayDistribution::pack(int **schema_buf) +{ + printf("In ArrayDistributon: pack, shouldn't be called\n"); +} + +int ArrayDistribution::get_next_index(Chunk *&chunk, int old_val, + int io_node_num, + int num_io_nodes, int max) +{ + printf("In ArrayDistributon: get_next_index shouldn't be called\n"); + return max; +} + +List *ArrayDistribution::chunk_list() +{ + return chunk_list_; +} + +Chunk* 
ArrayDistribution::get_next_chunk() +{ + if (current_cell_) current_cell_ = current_cell_->next(); + else current_cell_ = chunk_list_->head_; + + if (current_cell_) return ((Chunk *)current_cell_->item()); + return NULL; +} + +int ArrayDistribution::total_elements() +{ + printf("In ArrayDistributon: total_elements shouldn't be called\n"); + return 0; +} + +ChunkAllocPolicy ArrayDistribution::alloc_policy() +{ + printf("In ArrayDistributon: alloc_policy, shouldn't be called\n"); + return ROUND_ROBIN; +} + +void ArrayDistribution::list_clear() { current_cell_ = NULL; } + +/******************************** + * RegularDistribution * + ********************************/ +RegularDistribution::RegularDistribution(int rank, ArrayLayout *layout, + Distribution *dist, + ChunkAllocPolicy alloc_policy, + Block_Distribution block_dist) + : ArrayDistribution() +{ + layout_ = new ArrayLayout(layout); + rank_ = rank; + alloc_policy_ = alloc_policy; + dist_ = copy_distribution(rank_, dist); + block_dist_ = block_dist; +} + +RegularDistribution::RegularDistribution(int **schema_buf) : ArrayDistribution() +{ + int* ptr = *schema_buf; + layout_ = new ArrayLayout(&ptr); + rank_ = *ptr++; + dist_ = new_distribution(&ptr, rank_); + alloc_policy_ = (ChunkAllocPolicy)*ptr++; + block_dist_ = (Block_Distribution)*ptr++; + *schema_buf = ptr; +} + +RegularDistribution::~RegularDistribution() +{ + if (layout_) { delete layout_; layout_ = NULL; } + if (dist_ ) { free(dist_); dist_ = NULL; } +} + +Boolean RegularDistribution::equal(ArrayDistribution *that) +{ + if (!that) return NO; + + RegularDistribution *tmp; + if (that->distribution_type() == Regular) + tmp = (RegularDistribution *)that; + else return NO; + + if (layout_->equal(tmp->layout_) && + equal_distribution(rank_, dist_, tmp->dist_)) return YES; + return NO; +} + +ArrayLayout *RegularDistribution::layout() +{ + return layout_; +} + +Distribution *RegularDistribution::distribution() +{ + return dist_; +} + +void 
RegularDistribution::pack(int **schema_buf) +{ + int* ptr = *schema_buf; + + *ptr++ = (int)Regular; + layout_->pack(&ptr); + *ptr++ = rank_; + pack_distribution(&ptr, rank_, dist_); + *ptr++ = (int)alloc_policy_; + *ptr++ = block_dist_; + *schema_buf = ptr; +} + +int RegularDistribution::distribution_type() +{ + return Regular; +} + +int RegularDistribution::total_elements() +{ + return layout_->total_elements(); +} + +ChunkAllocPolicy RegularDistribution::alloc_policy() { return alloc_policy_; } + +int RegularDistribution::get_next_index(Chunk *&chunk, int old_val, + int io_node_num, + int num_io_nodes, int max) +{ + if (old_val == -1) return io_node_num; + else return (old_val + num_io_nodes); +} + +Block_Distribution RegularDistribution::block_dist() { return block_dist_; } + +/******************************** + * IrregularDistribution * + ********************************/ +int IrregularDistribution::distribution_type() +{ + return Irregular; +} + +int IrregularDistribution::total_elements() +{ + return num_of_chunks_; +} + +int IrregularDistribution::get_next_index(Chunk *&chunk, + int old_val, int io_node_num, + int num_io_nodes, int max) +{ + chunk = get_next_chunk(); + if (chunk == NULL) return max; + return chunk->chunk_id(); +} + +IrregularDistribution::IrregularDistribution(int num, Chunk **chunk_list) + : ArrayDistribution() +{ + for (int i=0; i<num; i++) add_last(chunk_list[i]); +} diff --git a/src/Panda/ArrayDistribution.h b/src/Panda/ArrayDistribution.h new file mode 100644 index 0000000..12e68e1 --- /dev/null +++ b/src/Panda/ArrayDistribution.h @@ -0,0 +1,70 @@ +#ifndef ArrayDistribution_dot_h +#define ArrayDistribution_dot_h + +#include "definitions.h" +#include "List.h" +#include "ArrayLayout.h" +#include "Chunk.h" + +class Array; +class ArrayDistribution +{ +protected: + int num_of_chunks_; + List *chunk_list_; + Cell *current_cell_; +public: + ArrayDistribution(); + ArrayDistribution(int **); + virtual ~ArrayDistribution(); + virtual Boolean 
equal(ArrayDistribution *); + virtual int distribution_type(); + virtual void pack(int **); + virtual int total_elements(); + virtual ChunkAllocPolicy alloc_policy(); + virtual int get_next_index(Chunk *&,int,int,int,int); + List *chunk_list(); + void add_last(Chunk *); + Chunk *get_next_chunk(); + void list_clear(); +}; + + +class RegularDistribution : public ArrayDistribution +{ + ArrayLayout *layout_; + int rank_; + Distribution *dist_; + Block_Distribution block_dist_; + ChunkAllocPolicy alloc_policy_; +public: + RegularDistribution(int **); + RegularDistribution(int , ArrayLayout *, + Distribution *, ChunkAllocPolicy, + Block_Distribution, int*); + RegularDistribution(int , ArrayLayout *, + Distribution *, ChunkAllocPolicy, + Block_Distribution); + ~RegularDistribution(); + Boolean equal(ArrayDistribution *); + ArrayLayout *layout(); + Distribution *distribution(); + int distribution_type(); + void pack(int **); + int total_elements(); + ChunkAllocPolicy alloc_policy(); + int get_next_index(Chunk *&,int,int,int,int); + Block_Distribution block_dist(); +}; + +class IrregularDistribution : public ArrayDistribution +{ +public: + IrregularDistribution(int, Chunk **); + int distribution_type(); + int total_elements(); + int get_next_index(Chunk *&,int,int,int,int); +}; + +#endif + diff --git a/src/Panda/ArrayGroup.C b/src/Panda/ArrayGroup.C new file mode 100644 index 0000000..afba023 --- /dev/null +++ b/src/Panda/ArrayGroup.C @@ -0,0 +1,521 @@ +#include "definitions.h" +#include "MPIFS.h" +#include "Array.h" +#include "ArrayGroup.h" + +extern MPIFS *MPIFS_global_obj; + +ArrayGroup::ArrayGroup() +{ + do_init(); +} + +ArrayGroup::ArrayGroup(char *name) +{ + do_init(); + name_ = (char *)malloc(strlen(name)+1); + strcpy(name_, name); +} + + +/* Function to initialize the state of the newly created object */ +void ArrayGroup::do_init() +{ + num_of_arrays_ = 0; + list_ = new List(); + io_strategy_ = SIMPLE_IO; + interleaved_ = NO; + common_layouts_ = NO; + 
common_layout_rank_ = 0; + compute_layout_ = NULL; + compute_distribution_ = NULL; + io_layout_ = NULL; + io_distribution_ = NULL; + group_io_count_ = 0; + read_io_count_ =0; + checkpoint_count_ = 1; + simulate_ = NO; + verify_ = NO; + name_ = NULL; +} + +void ArrayGroup::clear() +{ + if (name_) free(name_); + if (compute_layout_ != NULL) delete compute_layout_; + if (compute_distribution_ != NULL) delete compute_distribution_; + if (io_layout_ != NULL) delete io_layout_; + if (io_distribution_ != NULL) delete io_distribution_; + if (list_) delete list_; + name_ = NULL; + compute_layout_ = NULL; compute_distribution_ = NULL; + io_layout_ = NULL; io_distribution_ = NULL; + list_ = new List(); +} + +/* Destructor function - Note that we don't have to delete the * + * arrays in the arraygroup over here. The arrays are deleted * + * by the user */ +ArrayGroup::~ArrayGroup() +{ + if (name_) free(name_); + if (compute_layout_ != NULL) delete compute_layout_; + if (compute_distribution_ != NULL) delete compute_distribution_; + if (io_layout_ != NULL) delete io_layout_; + if (io_distribution_ != NULL) delete io_distribution_; + if (list_) delete list_; + name_ = NULL; + compute_layout_ = NULL; + io_layout_ = NULL; + compute_distribution_ = NULL; + io_distribution_ = NULL; + list_ = NULL; +} + +/* Function to delete the arrays in the arraygroup. This is used * + * on the io node side to delete the arrays after the collective * + * io operation has been completed. On the compute node side, the* + * user explicitly deletes the arrays */ +void ArrayGroup::delete_arrays() +{ + Cell* list_ptr = (list_ != NULL? list_->head_: NULL); + Array* array_ptr; + + while(list_ptr) + { + array_ptr = (Array *) list_ptr->item(); + delete array_ptr; + list_ptr = list_ptr->next(); + } + if (list_) delete list_; + list_ = NULL; +} + +/* Assign id numbers to the arrays in the arraygroup. This function * + * must be called at the start of each collective i/o operation. 
*/ +void ArrayGroup::assign_id() +{ + Cell* list_ptr = (list_ != NULL? list_->head_: NULL); + Array* array_ptr; + int i=0; + + while(list_ptr) + { + array_ptr = (Array *) list_ptr->item(); + array_ptr->set_array_id(i); +#ifdef DEBUG + printf("Assigned Id %d\n", i); +#endif + i++; + list_ptr = list_ptr->next(); + } +} + +/* Insert a new array. Before inserting the array, check to * + * see if it has a common layout with the rest of the arrays */ +void ArrayGroup::insert(Array *new_array) +{ + num_of_arrays_++; + + /* Has common layouts since this is the first array */ + if (num_of_arrays_ == 1) + { + common_layout_rank_ = new_array->rank(); + compute_layout_ = new ArrayLayout(new_array->layout(COMPUTE_NODE)); + compute_distribution_ = copy_distribution(common_layout_rank_, + new_array->distribution(COMPUTE_NODE)); + io_layout_ = new ArrayLayout(new_array->layout(IO_NODE)); + io_distribution_ = copy_distribution(common_layout_rank_, + new_array->distribution(IO_NODE)); + common_layouts_ = YES; + } + else if (common_layouts_) + { + /* check to see if the array has the same layouts/dist */ + if ((common_layout_rank_ == new_array->rank()) && + (compute_layout_->equal(new_array->layout(COMPUTE_NODE))) && + (io_layout_->equal(new_array->layout(IO_NODE))) && + (equal_distribution(common_layout_rank_, compute_distribution_, + new_array->distribution(COMPUTE_NODE))) && + (equal_distribution(common_layout_rank_, io_distribution_, + new_array->distribution(IO_NODE)))) + { + common_layouts_ = YES; + } + else + { + common_layouts_ = NO; + if (io_layout_) delete io_layout_; + if (compute_layout_) delete compute_layout_; + io_layout_ = compute_layout_ = NULL; + if (io_distribution_) free(io_distribution_); + if (compute_distribution_) free(compute_distribution_); + io_distribution_ = compute_distribution_ = NULL; + } + } + + list_->add_last(new_array); +} + +/* This function is called on the compute node side at the start * + * of each collective io operation. 
The information is packed * + * into an integer buffer. An assumption is made that the a buf * + * of 100 ints is sufficent for each array. */ +void ArrayGroup::pack(int** schema, int* schema_size) +{ + int *ptr, *head; + int i, len; + + /* Assuming that schema size of Array is ~= 100 elts */ + ptr = (int *) malloc(sizeof(int)*100*(num_of_arrays_+1)); + head = ptr; + + /* Round about way and space inefficent way of storing a name */ + *ptr++ = io_strategy_; + len = strlen(name_); + *ptr++ = len; + for(i=0; i<len;i++) + *ptr++ = (int) name_[i]; + + *ptr++ = num_of_arrays_; + *ptr++ = (int) interleaved_; + *ptr++ = (int) simulate_; + *ptr++ = (int) verify_; + *ptr++ = (int) common_layouts_; + if (common_layouts_) + { + *ptr++ = common_layout_rank_ ; + compute_layout_->pack(&ptr); + pack_distribution(&ptr, common_layout_rank_, compute_distribution_); + io_layout_->pack(&ptr); + pack_distribution(&ptr, common_layout_rank_, io_distribution_); + } + *ptr++ = group_io_count_; + *ptr++ = checkpoint_count_; + *ptr++ = op_type_; + + pack_arrays(&ptr, common_layouts_); + + *schema_size = (int)(ptr - head); + *schema = head; +} + +/* This function is called on the I/O node side. After receiving * + * the collective io schema, the information is unpacked. 
The * + * arrays are unpacked seperately via a another function call */ +void ArrayGroup::unpack(int **schema_ptr) +{ + int *ptr = *schema_ptr; + int len; + + /* Unpack the name */ + io_strategy_ = *ptr++; + len = *ptr++; + name_ = (char *) malloc(len+1); + for(int i=0; i< len; i++) + name_[i] = (char) *ptr++; + name_[len] = '\0'; + + num_of_arrays_ = *ptr++; + interleaved_ = (Boolean) *ptr++; + simulate_ = (Boolean) *ptr++; + verify_ = (Boolean) *ptr++; + common_layouts_ = (Boolean) *ptr++; + if (common_layouts_) + { + common_layout_rank_ = *ptr++; + compute_layout_ = new ArrayLayout(&ptr); + compute_distribution_ = new_distribution(&ptr, common_layout_rank_); + io_layout_ = new ArrayLayout(&ptr); + io_distribution_ = new_distribution(&ptr, common_layout_rank_); + } + else + { + common_layout_rank_ = 0; + compute_layout_ = io_layout_ = NULL; + compute_distribution_ = io_distribution_ = NULL; + } + group_io_count_ = *ptr++; + checkpoint_count_ = *ptr++; + op_type_ = *ptr++; + + /* Arrays are being unpacked seperately */ + *schema_ptr = ptr; +} + +void ArrayGroup::unpack_arrays(int **schema_buf) +{ + Array *array; + int i, *ptr = *schema_buf; + + if (common_layouts_){ + for(i=0;i<num_of_arrays_;i++){ + array = new Array(&ptr, common_layouts_, compute_layout_, + compute_distribution_, io_layout_, + io_distribution_); + list_->add_last(array); + } + } else { + for(i=0;i<num_of_arrays_;i++){ + array = new Array(&ptr, common_layouts_); + list_->add_last(array); + } + } + *schema_buf = ptr; +} + + +/* The collective io operation to write out the arrays. 
*/ +void ArrayGroup::timestep() +{ + int *schema, schema_size; + int node_type = MPIFS_global_obj->node_type(); + /* Assign id numbers to each array */ + assign_id(); + + op_type_ = TIMESTEP; + if (node_type == COMPUTE_NODE){ + MPIFS_global_obj->send_group_schema(this); + MPIFS_global_obj->compute_node_io_loop(this); + } + else if (node_type == PART_TIME_COMPUTE) + MPIFS_global_obj->compute_node_io_loop(this); + else { + pack(&schema, &schema_size); + MPIFS_global_obj->part_time_io_node_loop(schema, schema_size, this); + } + /* Commented out for testing purposes */ +// group_io_count_++; +} + +/* The collective io operation to write out the arrays. */ +void ArrayGroup::general_write() +{ + int *schema, schema_size; + int node_type = MPIFS_global_obj->node_type(); + /* Assign id numbers to each array */ + assign_id(); + + op_type_ = GENERAL_WRITE; + if (node_type == COMPUTE_NODE){ + MPIFS_global_obj->send_group_schema(this); + MPIFS_global_obj->compute_node_io_loop(this); + } + else if (node_type == PART_TIME_COMPUTE) + MPIFS_global_obj->compute_node_io_loop(this); + else { + pack(&schema, &schema_size); + MPIFS_global_obj->part_time_io_node_loop(schema, schema_size, this); + } + /* Commented out for testing purposes */ +// group_io_count_++; +} + + +/* The collective io operation to write out the arrays. 
*/ +void ArrayGroup::checkpoint() +{ + int *schema, schema_size; + int node_type = MPIFS_global_obj->node_type(); + /* Assign id numbers to each array */ + assign_id(); + + if (checkpoint_count_ == 0) + checkpoint_count_ = 1; + else + checkpoint_count_ = 0; + + op_type_ = CHECKPOINT; + if (node_type == COMPUTE_NODE){ + MPIFS_global_obj->send_group_schema(this); + MPIFS_global_obj->compute_node_io_loop(this); + } + else if (node_type == PART_TIME_COMPUTE) + MPIFS_global_obj->compute_node_io_loop(this); + else { + pack(&schema, &schema_size); + MPIFS_global_obj->part_time_io_node_loop(schema, schema_size, this); + } + +} + + + +/* The collective io operation to read in the arrays from a * + * checkpoint file. Currently (for testing purposes) this * + * does not happen. */ +void ArrayGroup::restart() +{ + int *schema, schema_size; + int node_type = MPIFS_global_obj->node_type(); + + /* Assign id numbers to each array */ + assign_id(); + + op_type_ = RESTART; + if (node_type == COMPUTE_NODE){ + MPIFS_global_obj->send_group_schema(this); + MPIFS_global_obj->compute_node_io_loop(this); + } + else if (node_type == PART_TIME_COMPUTE) + MPIFS_global_obj->compute_node_io_loop(this); + else { + pack(&schema, &schema_size); + MPIFS_global_obj->part_time_io_node_loop(schema, schema_size , this); + } +} + + +void ArrayGroup::read_timestep() +{ + int *schema, schema_size; + int node_type = MPIFS_global_obj->node_type(); + + /* Assign id numbers to each array */ + assign_id(); + + op_type_ = READ_TIMESTEP; + if (node_type == COMPUTE_NODE){ + MPIFS_global_obj->send_group_schema(this); + MPIFS_global_obj->compute_node_io_loop(this); + } + else if (node_type == PART_TIME_COMPUTE) + MPIFS_global_obj->compute_node_io_loop(this); + else { + pack(&schema, &schema_size); + MPIFS_global_obj->part_time_io_node_loop(schema, schema_size , this); + } + /* Commented out for testing purposes */ +// read_io_count_++; +} + + +void ArrayGroup::general_read() +{ + int *schema, schema_size; + int 
node_type = MPIFS_global_obj->node_type(); + + /* Assign id numbers to each array */ + assign_id(); + + op_type_ = GENERAL_READ; + if (node_type == COMPUTE_NODE){ + MPIFS_global_obj->send_group_schema(this); + MPIFS_global_obj->compute_node_io_loop(this); + } + else if (node_type == PART_TIME_COMPUTE) + MPIFS_global_obj->compute_node_io_loop(this); + else { + pack(&schema, &schema_size); + MPIFS_global_obj->part_time_io_node_loop(schema, schema_size , this); + } + /* Commented out for testing purposes */ +// read_io_count_++; +} + + + + +/* Given an array id find the array object in the array group * + * The code caches the previous search value and starts the * + * search from there. This helps especially in the case of * + * when the arrays are accessed sequentially */ +Array* ArrayGroup::find_array(int array_id) +{ + Cell* list_ptr = (list_ != NULL ? + (list_->old_search_val_ != NULL ? list_->old_search_val_ : + list_->head_) + : NULL); + Array* array_ptr; + + while(list_ptr) + { + array_ptr = (Array *) list_ptr->item(); + if (array_ptr->array_id() == array_id) + { + list_->old_search_val_ = list_ptr->next(); + return array_ptr; + } + list_ptr = list_ptr->next(); + } + + list_ptr = list_->head_; + while (list_->old_search_val_ && (list_ptr != list_->old_search_val_)) + { + array_ptr = (Array *) list_ptr->item(); + if (array_ptr->array_id() == array_id) + { + list_->old_search_val_ = list_ptr->next(); + return array_ptr; + } + list_ptr = list_ptr->next(); + } + return NULL; +} + +/* Pack the arrays into an integer schema buffer. Assumes that the * + * data is already allocated. */ +void ArrayGroup::pack_arrays(int **schema_buf, Boolean common_layouts) +{ + Cell* list_ptr = (list_ != NULL ? 
list_->head_ : NULL); + Array* array_ptr; + + while(list_ptr) + { + array_ptr = (Array *) list_ptr->item(); + array_ptr->pack(schema_buf, common_layouts); + list_ptr = list_ptr->next(); + } + +} + + +int ArrayGroup::op_type(){return op_type_;} + +void ArrayGroup::set_simulate(){simulate_ = YES;} + +void ArrayGroup::reset_simulate(){simulate_ = NO;} + +void ArrayGroup::set_simulate_mode(){simulate_ = YES;} + +void ArrayGroup::reset_simulate_mode(){simulate_ = NO;} + +Boolean ArrayGroup::simulate(){return simulate_;} + + +void ArrayGroup::set_verify(){verify_ = YES;} + +void ArrayGroup::reset_verify(){verify_ = NO;} + +void ArrayGroup::set_verify_mode(){verify_ = YES;} + +void ArrayGroup::reset_verify_mode(){verify_ = NO;} + +Boolean ArrayGroup::verify(){return verify_;} + +/* This function is called on each compute node side and after all the + * arrays have been assigned an id. For each array in the arraygroup, + * the function computes the total number of bytes on the compute node + */ +void ArrayGroup::init_array_info(int *num_arrays, int **array_bytes_to_go) +{ + int *tmp_buf = (int *) malloc(sizeof(int)*num_of_arrays_); + *array_bytes_to_go = tmp_buf; + *num_arrays = num_of_arrays_; + + for(int i=0; i< num_of_arrays_; i++) + tmp_buf[i] = find_array(i)->array_info(); +} + +int ArrayGroup::io_strategy(){ + return io_strategy_; +} + +void ArrayGroup::set_io_strategy(int new_strategy){ + io_strategy_ = new_strategy; +} + +int ArrayGroup::num_of_arrays() +{ + return num_of_arrays_; +} diff --git a/src/Panda/ArrayGroup.h b/src/Panda/ArrayGroup.h new file mode 100644 index 0000000..0cd741b --- /dev/null +++ b/src/Panda/ArrayGroup.h @@ -0,0 +1,75 @@ +#ifndef Arraygroup_dot_h +#define Arraygroup_dot_h + +class Array; +class ArrayLayout; +#include "List.h" +#include "definitions.h" + +class ArrayGroup { + protected: + char *name_; /* Name of the arraygroup */ + int num_of_arrays_; /* Number of arrays in group */ + List *list_; /* List of arrays */ + int io_strategy_; + + 
/* If all the arrays have the same io and compute node layouts */ + Boolean common_layouts_; + int common_layout_rank_; + ArrayLayout *compute_layout_; + Distribution *compute_distribution_; + ArrayLayout *io_layout_; + Distribution *io_distribution_; + + + int group_io_count_; + int read_io_count_; + int checkpoint_count_; + int op_type_; + + + Boolean interleaved_; + Boolean simulate_; + Boolean verify_; + + void do_init(); + void delete_arrays(); + void assign_id(); + void pack_arrays(int**, Boolean); + + public: + + ArrayGroup(); + ArrayGroup(char *); + virtual ~ArrayGroup(); + void insert(Array*); + void pack(int**, int*); + void unpack(int**); + void timestep(); + void general_write(); + void checkpoint(); + void restart(); + void read_timestep(); + void general_read(); + Array *find_array(int); + int op_type(); + void set_simulate(); + void reset_simulate(); + void set_simulate_mode(); + void reset_simulate_mode(); + Boolean simulate(); + Boolean verify(); + void set_verify(); + void reset_verify(); + void set_verify_mode(); + void reset_verify_mode(); + void unpack_arrays(int**); + void init_array_info(int*,int**); + void set_io_strategy(int); + int io_strategy(); + int num_of_arrays(); + void clear(); +}; + +#endif + diff --git a/src/Panda/ArrayLayout.C b/src/Panda/ArrayLayout.C new file mode 100644 index 0000000..1398ef4 --- /dev/null +++ b/src/Panda/ArrayLayout.C @@ -0,0 +1,179 @@ +#include "definitions.h" +#include "ArrayLayout.h" + +ArrayLayout::ArrayLayout(int Rank, int *sizearray):Template(Rank, sizearray){} + +/* Create an arraylayout object using info stored in the schema buffer */ +ArrayLayout::ArrayLayout(int **schema_buf) +{ + int* ptr = *schema_buf; + + rank_ = *ptr++; + size_ = (int *) malloc(sizeof(int)*rank_); + for(int i=0; i < rank_; i++) + size_[i] = *ptr++; + + *schema_buf = ptr; +} + +/* Make a copy of an existing ArrayLayout object */ +ArrayLayout::ArrayLayout(ArrayLayout *old_layout) +{ + rank_ = old_layout->rank(); + size_ = 
copy_int_list(rank_, old_layout->size()); +} + +/* Use the destructor of the Template object */ +ArrayLayout::~ArrayLayout() +{ +} + +/* converts a chunk index to a number */ +int ArrayLayout::convert_from_index_to_number(int *indices) +{ + int result=0, temp_product=1; + for(int i=rank_-1; i>=0; i--) + { + result += temp_product * indices[i]; + temp_product *= size_[i]; + } + return result; +} + +/* converts a number to the appropriate chunk index */ +void ArrayLayout::convert_from_number_to_index(int num, int *result) +{ + int temp_product=1; + int i, j; + + for(i = 0; i< rank_; i++) + { + temp_product = 1; + for(j = i+1 ; j < rank_; j++) + temp_product *= size_[j]; + result[i] = num / temp_product; + num -= num/temp_product *temp_product; + } +} + +/* converts a number to the appropriate chunk index */ +int* ArrayLayout::convert_from_number_to_index(int num) +{ + int* result = (int *) malloc(sizeof(int)*rank_); + convert_from_number_to_index(num, result); + return result; +} + +/* Check if the input indices are valid. Assumes that the rank of * + * input indices are the same as rank of the layout */ +Boolean ArrayLayout::valid_index(int *indices) +{ + if (indices == NULL) return NO; + else for(int i=0; i<rank_; i++) + { + if ((indices[i] < 0) || (indices[i] >= size_[i])) + return NO; + } + return YES; +} + +/* Checks if the specified input distribution of the array is * + * compatible with the layout. 
It is compatible only if the * + * number of dimensions in which the array is distributed in a * + * BLOCK or CYCLIC fashion is equal to the rank of the layout */ +Boolean ArrayLayout::valid_distribution(int array_rank, Distribution* dist) +{ + if ((array_rank <= 0) || (dist == NULL)) return NO; + else { + int block_or_cyclic=0, i; + for (i=0;i<array_rank;i++) + if ((dist[i]==BLOCK)||(dist[i]==CYCLIC)) block_or_cyclic++; + if (block_or_cyclic != rank_) + return NO; + else + return YES; + } +} + + +int ArrayLayout::size(int i){return size_[i];} + + + + +/* This function is used to return a linked list of numbers (representing the + * indices of the compute node chunks which overlap with the io node chunk) + * given the base, size of the overlapping layout (??). + * + * The function assumes that the input is valid + */ +void ArrayLayout::indices_list(int *index_base, int *index_size, + int *num, int *ret_list) +{ + int *ptr=ret_list; + int size=1; + for(int i=0; i < rank(); i++) + size *= index_size[i]; + *num = size; + calculate_indices(index_base, index_size, rank(), 0, &ptr); +} + +/* Recursive function to convert a layout into a list of numbers */ +void ArrayLayout::calculate_indices(int *index_base, int *index_size, + int my_rank, int sum, int **buf_ptr) +{ + int prod=1, i; + int *ptr; + +#ifdef DEBUG + printf("In calculate indices rank=%d sum=%d *buf=%ld\n", my_rank, sum, *buf_ptr); +#endif + if (my_rank > 1) + { + for(i=rank()-1; i > (rank() - my_rank) ; i--) + prod *= size_[i]; + for(i=0 ; i < index_size[rank()-my_rank]; i++) + calculate_indices(index_base, index_size, my_rank-1, + sum + (index_base[rank()-my_rank]+i)*prod, buf_ptr); + } + else + { + for(i=0; i < index_size[rank()-my_rank]; i++) + { + ptr = *buf_ptr; + *ptr++ = sum + (index_base[rank()-my_rank]+i); +#ifdef DEBUG + printf("In calculate indices *buf=%ld val=%d\n", *buf_ptr, **buf_ptr); +#endif + *buf_ptr = ptr; + } + } +} + +/* Pack the info into the schema buffer */ +void ArrayLayout::pack(int 
**schema_buf) +{ + int* ptr = *schema_buf; + + *ptr++ = rank_; + for(int i=0; i< rank_; i++) + *ptr++ = size_[i]; + + *schema_buf = ptr; +} + +/* Check if the two layouts are equal */ +Boolean ArrayLayout::equal(ArrayLayout *layout) +{ + if (rank_ != layout->rank()) return NO; + for(int i=0; i<rank_; i++) + if (size_[i] != layout->size(i)) return NO; + return YES; +} + + + +int* ArrayLayout::size(){return size_;} + + + diff --git a/src/Panda/ArrayLayout.h b/src/Panda/ArrayLayout.h new file mode 100644 index 0000000..d5e0a65 --- /dev/null +++ b/src/Panda/ArrayLayout.h @@ -0,0 +1,26 @@ +#ifndef ArrayLayout_dot_h +#define ArrayLayout_dot_h + +#include "Template.h" + +class ArrayLayout : public Template { + /* Inherits rank_,size_ from Template */ + public: + ArrayLayout(int Rank, int *sizearray); + ArrayLayout(int** schema_buf); + ArrayLayout(ArrayLayout *old_layout); + virtual ~ArrayLayout(); + void pack(int** schema_buf); + int convert_from_index_to_number(int *indices); + int* convert_from_number_to_index(int num); + void convert_from_number_to_index(int num, int *result); + Boolean valid_index(int *); + Boolean valid_distribution(int, Distribution*); + Boolean equal(ArrayLayout*); + int size(int); + int* size(); + void indices_list(int*, int*, int*, int*); + void calculate_indices(int*,int*,int,int,int**); +}; + +#endif diff --git a/src/Panda/Attribute.C b/src/Panda/Attribute.C new file mode 100644 index 0000000..0c50f04 --- /dev/null +++ b/src/Panda/Attribute.C @@ -0,0 +1,187 @@ +#include "definitions.h" +#include "Attribute.h" +#include "MPIFS.h" +#include "string.h" + + +extern MPIFS *MPIFS_global_obj; +extern "C" { + int IOwriteAttribute(IOFile,char*,int,int,void *); + int IOsizeOf(int); + int IOreadAttributeInfo(IOFile,char*,int*,int*); + int IOreadAttribute(IOFile,int,void*); +// IOFile IEEEopen(char *,char *); +} + +Attribute::Attribute() +{ + name_ = NULL; + data_status_ = 0; + data_ = NULL; +} + +void Attribute::init(char *name) +{ + int len = 
strlen(name); + name_ = (char *)malloc(sizeof(char) * (len + 1)); + for (int i=0; i<len; i++) name_[i] = name[i]; + name_[i] = '\0'; +} + +void Attribute::init(char *name, int esize, int count, void *data) +{ + int len = strlen(name); + name_ = (char *)malloc(sizeof(char) * (len + 1)); + for (int i=0; i<len; i++) name_[i] = name[i]; + name_[i] = '\0'; + esize_ = esize; + count_ = count; + data_ = data; + data_status_ = 0; +} + +Attribute::~Attribute() +{ + if (name_) free(name_); + if (data_status_ && data_) free(data_); +} + +void Attribute::pack(int &schema_len, char *&schema, char *fname, int op_type) +{ + union int_to_char tmp; + int i, real_size = IOsizeOf(esize_); + + int len1 = strlen(fname); + int len = strlen(name_); + if (op_type == TIMESTEP) + schema_len = 5 * sizeof(int) + len1 + len + real_size * count_; + else schema_len = 3 * sizeof(int) + len1 + len; + schema = (char *)malloc(sizeof(char) * schema_len); + char *ptr = schema; + + tmp.i = op_type; + for (i=0; i<4; i++) *ptr++ = tmp.c[i]; + tmp.i = len1; + for (i=0; i<4; i++) *ptr++ = tmp.c[i]; + for (i=0; i<len1; i++) *ptr++ = fname[i]; + tmp.i = len; + for (i=0; i<4; i++) *ptr++ = tmp.c[i]; + for (i=0; i<len; i++) *ptr++ = name_[i]; + + if (op_type == TIMESTEP) { + tmp.i = esize_; + for (i=0; i<4; i++) *ptr++ = tmp.c[i]; + tmp.i = count_; + for (i=0; i<4; i++) *ptr++ = tmp.c[i]; + memcpy(ptr, data_, real_size * count_); + } +} + +Attribute::Attribute(char *schema, int op_type) +{ + union int_to_char tmp; + int i, len, real_size; + char *ptr = schema; + + for (i=0; i<4; i++) tmp.c[i] = *ptr++; + len = tmp.i; + name_ = (char *)malloc(sizeof(char) * (len + 1)); + for (i=0; i<len; i++) name_[i] = *ptr++; + name_[i] = '\0'; + + if (op_type == TIMESTEP) { + for (i=0; i<4; i++) tmp.c[i] = *ptr++; + esize_ = tmp.i; + real_size = IOsizeOf(esize_); + for (i=0; i<4; i++) tmp.c[i] = *ptr++; + count_ = tmp.i; + data_ = (void *)malloc(esize_ * count_); + memcpy(data_, ptr, real_size * count_); + data_status_ = 1; 
+ } +} + +void Attribute::read(char *fname, char *n) +{ + int node_type = MPIFS_global_obj->node_type(); + IOFile fp; + + if (node_type == PART_TIME_COMPUTE || node_type == COMPUTE_NODE) { + if (MPIFS_global_obj->am_master_compute_node()) { + init(n); + MPIFS_global_obj->send_attr_schema(this, fname, READ_TIMESTEP); + } + MPIFS_global_obj->receive_attr_data(this); + } else { // PART_TIME_IO + init(n); + if (MPIFS_global_obj->am_master_compute_node()) + MPIFS_global_obj->send_attr_schema(this, fname, READ_TIMESTEP); + MPIFS_global_obj->receive_attr_schema(); + + int len = strlen(fname); + char *name = (char *)malloc(sizeof(char) * (len+1)); + char *name1 = (char *)malloc(sizeof(char) * (len+6)); + for (int i=0; i<len; i++) name[i] = fname[i]; + name[i] = '\0'; + sprintf(name1, "%s.%d", name, MPIFS_global_obj->my_rank(IO_NODE)); + fp = MPIFS_global_obj->open_file(name1, READ_TIMESTEP); + read_data(fp); + if (MPIFS_global_obj->am_master_io_node()) { + MPIFS_global_obj->send_attr_data(this); + } + MPIFS_global_obj->receive_attr_data(this); + free(name); + } +} + +void Attribute::write(char *fname, char *n, int esize, int count, void *data) +{ + int node_type = MPIFS_global_obj->node_type(); + + if (node_type == PART_TIME_COMPUTE || node_type == COMPUTE_NODE) { + if (MPIFS_global_obj->am_master_compute_node()) { + init(n, esize, count, data); + MPIFS_global_obj->send_attr_schema(this, fname, TIMESTEP); + } + } else { // PART_TIME_IO + init(n, esize, count, data); + if (MPIFS_global_obj->am_master_compute_node()) + MPIFS_global_obj->send_attr_schema(this, fname, TIMESTEP); + MPIFS_global_obj->receive_attr_schema(); + + IOFile fp; + int len = strlen(fname); + char *name = (char *)malloc(sizeof(char) * (len+1)); + char *name1 = (char *)malloc(sizeof(char) * (len+6)); + for (int i=0; i<len; i++) name[i] = fname[i]; + name[i] = '\0'; + sprintf(name1, "%s.%d", name, MPIFS_global_obj->my_rank(IO_NODE)); + + fp = MPIFS_global_obj->open_file(name1, TIMESTEP); + + write_data(fp); 
+ free(name); + } +} + +void Attribute::write_data(IOFile fp) +{ + IOwriteAttribute(fp, name_, esize_, count_, data_); +} + +void Attribute::read_data(IOFile fp) +{ + int index = IOreadAttributeInfo(fp, name_, &esize_, &count_); + if (index >= 0) { + data_ = (void *)malloc(IOsizeOf(esize_) * count_); + IOreadAttribute(fp, index, data_); + } else printf("Fail to read attribute %s\n", name_); +} + +void *Attribute::get_data_ptr() { return data_; } +void Attribute::set_data_ptr(void *d) { data_ = d; } +int Attribute::data_size() { return IOsizeOf(esize_) * count_; } +int Attribute::esize() { return esize_; } +int Attribute::count() { return count_; } +void Attribute::set_count(int c) { count_ = c; } +void Attribute::set_esize(int e) { esize_ = e; } diff --git a/src/Panda/Attribute.h b/src/Panda/Attribute.h new file mode 100644 index 0000000..d948316 --- /dev/null +++ b/src/Panda/Attribute.h @@ -0,0 +1,43 @@ +#ifndef Attribute_dot_h +#define Attribute_dot_h + +#include "definitions.h" + + +typedef union int_to_char { + int i; + char c[4]; +} int_to_char; + +//#include "../IEEEIO/IEEEIO.h" +//#include "../IEEEIO/IOProtos.h" +#include "external/IEEEIO/src/Arch.h" + +class Attribute { + char *name_; + int esize_; + int count_; + void *data_; + int data_status_; // 0: no alloc, 1: alloc + +public: + Attribute(); + Attribute(char *, int); + ~Attribute(); + void init(char *, int, int, void *); + void init(char *); + void pack(int &, char *&, char *, int); + void write(char *, char *, int, int, void *); + void read(char *, char *); + void write_data(IOFile); + void read_data(IOFile); + void *get_data_ptr(); + void set_data_ptr(void *); + int data_size(); + int esize(); + int count(); + void set_esize(int); + void set_count(int); +}; + +#endif diff --git a/src/Panda/CSDIO.C b/src/Panda/CSDIO.C new file mode 100644 index 0000000..b2d4064 --- /dev/null +++ b/src/Panda/CSDIO.C @@ -0,0 +1,694 @@ +#include "definitions.h" +#include "ArrayGroup.h" +#include "MPIFS.h" +#include 
"Chunk.h" +#include "App_Info.h" +#include "Array.h" +#include "message.h" +#include "CSDIO.h" +#include "List.h" + + +extern MPIFS* MPIFS_global_obj; +extern int SUBCHUNK_SIZE; + +/* This code is executed on the compute nodes (excluding the part-time i/o + * nodes). + */ +void CSDIO::compute_node_io_loop(ArrayGroup *group) +{ + int array_idx; + Boolean read_op; + + op_type_ = group->op_type(); + if ((op_type_ == RESTART) || (op_type_ == GENERAL_READ) || + (op_type_ == READ_TIMESTEP)){ + read_op = YES; + } else { + read_op = NO; + } + + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank_); + num_of_arrays_ = group->num_of_arrays(); + receive_io_app_info(); + num_io_nodes_ = io_app_info_->app_size(); +#ifdef DEBUG + printf("%d: op_type_ = %d read_op =%d\n", world_rank_, op_type_, read_op); + printf("%d: Compute node - num of arrays %d - num of io_nodes %d\n", + world_rank_, num_of_arrays_, num_io_nodes_); +#endif + comp_current_array_ = new Array(); + comp_current_array_id_ = -1; + + for(array_idx = 0; array_idx < num_of_arrays_; array_idx++){ + while(!process_compute_side_array(group, array_idx, read_op)){}; + } + delete comp_current_array_; + comp_current_array_ = NULL; +} + +/* An array is stored in the comp_current_array_. this must be instatntiated + * before use. If the input array_id is the same as that stored in + * comp_current_array_id_, then it means that all the required sends/recvs + * have been posted and all we have to do is to verify its completion. If + * they are different, then it means that we have to start the i/o for the + * new array. 
+ */ +Boolean CSDIO::process_compute_side_array(ArrayGroup *group, + int array_idx, Boolean read_op) +{ + int make_subchunks=-1, tag, tag_ctr=0, buf_size, bytes_to_go, flag, i; + char *tmp_buf; + void *void_buf; + Chunk *compute_chunk=NULL, *io_chunk=NULL, *subchunk=NULL; + + if (comp_current_array_id_ != array_idx){ + /* We have to post the sends/recvs for this array*/ + + comp_current_array_->copy(group->find_array(array_idx)); + comp_array_rank_ = comp_current_array_->rank(); + if (comp_array_rank_ > max_comp_rank_){ + realloc_compute_schema_bufs(comp_array_rank_); + } + + nat_chunked_ = comp_current_array_->nat_chunked(); + sub_chunked_ = comp_current_array_->sub_chunked(); + if (nat_chunked_ && !sub_chunked_) + contiguous_ = YES; + else + contiguous_ = NO; + compute_pending_ = 0; + + if (contiguous_){ + /* Nat chunking with no user-specified chunking. We don't need + * to use any MPI dervied datatypes. + */ + comp_current_array_->list_clear(); + compute_chunk = comp_current_array_->get_next_chunk(); + while(compute_chunk != NULL){ + comp_current_chunk_id_ = compute_chunk->chunk_id(); + io_overlaps_ = 1; + io_overlap_chunk_ids_[0] = comp_current_chunk_id_; + io_dest_ids_[0] = io_app_info_->world_rank(comp_current_array_->which_node( + comp_current_chunk_id_, + IO_NODE, num_io_nodes_)); + + if (io_dest_ids_[0] == world_rank_){ + /* Part-time case - do nothing, the io node should take + care of this */ + } + else { + bytes_to_go = compute_chunk->total_size_in_bytes(); + tmp_buf = (char *)compute_chunk->data_ptr(); + tag_ctr = 0; + while(bytes_to_go > 0){ + buf_size = min(SUBCHUNK_SIZE, bytes_to_go); + if (compute_pending_ >= max_pending_){ + realloc_pending_messages(compute_pending_+1); + } + + tag = comp_current_chunk_id_ * 1000 + tag_ctr*10; + if (read_op) + nb_receive_message((void *) tmp_buf, buf_size, MPI_CHAR, + io_dest_ids_[0], tag + CHUNK_DATA_FROM_IO, + MPI_COMM_WORLD, + &comp_requests_[compute_pending_]); + else + nb_send_message((void *) tmp_buf, 
buf_size, MPI_CHAR, + io_dest_ids_[0], tag + CHUNK_DATA_TO_IO, + MPI_COMM_WORLD, + &comp_requests_[compute_pending_]); + + tag_ctr++; + tmp_buf += buf_size; + bytes_to_go -= buf_size; + compute_pending_++; + } + } + + compute_chunk = comp_current_array_->get_next_chunk(); + } + + comp_current_array_->list_clear(); + } /* End if contiguous */ + else { + /* We have to use mpi-derived datatypes */ + make_subchunks = -1; + io_chunk = new Chunk(); + subchunk = new Chunk(); + comp_current_array_->list_clear(); + + compute_chunk = comp_current_array_->get_next_chunk(); + while (compute_chunk != NULL){ + comp_current_chunk_id_ = compute_chunk->chunk_id(); + + /* Determine the overlapping I/O chunks */ + io_chunk_overlaps(comp_current_array_, compute_chunk); + for( i=0;i< io_overlaps_;i++){ + if (io_dest_ids_[i] != world_rank_){ + /* Different node- so we have to post the send/recv */ + io_chunk->init(comp_current_array_, io_overlap_chunk_ids_[i], + IO_NODE, NO_ALLOC); + if (!sub_chunked_ && (make_subchunks == -1)){ + comp_current_array_->make_sub_chunks(io_chunk); + make_subchunks = 1; + } + + tag_ctr=0; + comp_num_of_subchunks_ = + comp_current_array_->layout(SUB_CHUNK)->total_elements(); +#ifdef DEBUG + printf("comp_num_of_subchunks_ = %d\n", comp_num_of_subchunks_); +#endif + for(comp_current_subchunk_id_ = 0; + comp_current_subchunk_id_ < comp_num_of_subchunks_; + comp_current_subchunk_id_++){ +#ifdef DEBUG + printf("io_chunk = %d subchunk_id = %d\n", + io_chunk->chunk_id(), comp_current_subchunk_id_); +#endif + subchunk->init(io_chunk, comp_current_subchunk_id_, NO_ALLOC); + subchunk->compute_overlap(compute_chunk, comp_overlap_base_, + comp_overlap_size_, comp_overlap_stride_); + buf_size = num_elements(comp_array_rank_, comp_overlap_size_); + if (buf_size > 0){ + /* Something to send */ + if (compute_pending_ >= max_pending_){ + realloc_pending_messages(compute_pending_+1); + } + void_buf = (void *)tmp_buf; + compute_chunk->make_datatype(comp_overlap_base_, 
comp_overlap_size_, + comp_overlap_stride_, + &void_buf, + &comp_datatypes_[compute_pending_]); + tmp_buf = (char *)void_buf; + tag = io_chunk->chunk_id()*1000 + tag_ctr*10; + if (read_op) + nb_receive_message((void *) tmp_buf, 1, + comp_datatypes_[compute_pending_], + io_dest_ids_[i], + tag + CHUNK_DATA_FROM_IO, + MPI_COMM_WORLD, + &comp_requests_[compute_pending_]); + else + nb_send_message((void *) tmp_buf, 1, + comp_datatypes_[compute_pending_], + io_dest_ids_[i], tag + CHUNK_DATA_TO_IO, + MPI_COMM_WORLD, + &comp_requests_[compute_pending_]); + compute_pending_++; + } + tag_ctr++; + } + } + } + compute_chunk = comp_current_array_->get_next_chunk(); + } + } + comp_current_array_id_ = array_idx; + return NO; + } else { + if (part_time_io_){ + /* Just test and get back to io-node stuff */ + MPI_Testall(compute_pending_, comp_requests_, &flag, comp_statuses_); + if (flag){ + if (!contiguous_){ + for(i=0; i<compute_pending_;i++) + MPI_Type_free(&comp_datatypes_[i]); + } + compute_pending_ = 0; + comp_current_array_->clear(); + return YES; + } + } else { +#ifdef DEBUG + printf("%d: Waiting for %d messages to complete\n", world_rank_, + compute_pending_); +#endif + MPI_Waitall(compute_pending_, comp_requests_, comp_statuses_); + if (!contiguous_){ + for(i=0; i<compute_pending_;i++) + MPI_Type_free(&comp_datatypes_[i]); + } +#ifdef DEBUG + printf("%d: Done waiting \n", world_rank_); +#endif + + compute_pending_ = 0; + comp_current_array_->clear(); + return YES; + } + return NO; + } +} + + +void CSDIO::start_to_finish(Boolean part_time, + ArrayGroup *compute_group) +{ + int array_idx, make_subchunks, bytes_to_go, buf_size, tag_ctr, tag; + Boolean read_op, part_time_done; + Chunk *chunk=NULL, *subchunk=NULL, *compute_chunk=NULL; + + /* Don't ask me why. 
Ask szu-Wen */ + comp_current_array_id_ = -1; + + if ((op_type_ == RESTART) || (op_type_ == GENERAL_READ) || + (op_type_ == READ_TIMESTEP)){ + read_op = YES; + } else { + read_op = NO; + } + + part_time_io_ = part_time; + compute_node_group_ = compute_group; + comp_current_array_ = NULL; + if (part_time_io_){ + comp_current_array_ = new Array(); + } + + /* Receive the i/o node information */ + receive_io_app_info(); + + /* To reduce costs associated with object creation and deletion, we * + * will create a dummy chunk,subchunk and compute chunk object and * + * re-initialize them whenever necessary. */ + chunk = new Chunk(); + current_chunk_ = chunk; + subchunk = new Chunk(); + compute_chunk = new Chunk(); + + for(array_idx=0; array_idx<num_of_arrays_; array_idx++){ + + if (part_time_io_) + part_time_done = process_compute_side_array(compute_group, array_idx, read_op); + + make_subchunks = -1; + current_array_ = find_array(array_idx); + nat_chunked_ = current_array_->nat_chunked(); + sub_chunked_ = current_array_->sub_chunked(); + if (nat_chunked_ && !sub_chunked_) + contiguous_ = YES; + else + contiguous_ = NO; + + array_rank_ = current_array_->rank(); + if (array_rank_ > max_rank_){ + realloc_schema_bufs(array_rank_); + } + + num_of_chunks_ = current_array_->layout(IO_NODE)->total_elements(); + current_chunk_id_ = current_array_->get_next_index(-1, my_io_rank_, + num_io_nodes_); + if (contiguous_){ + /* Natural chunked and no-user specified subchunking */ + + while(current_chunk_id_ < num_of_chunks_){ + num_overlaps_ = 1; + overlap_chunk_ids_[0] = current_chunk_id_; + dest_ids_[0] = app_info_->world_rank(current_array_->which_node( + current_chunk_id_, COMPUTE_NODE)); + if (part_time_io_ && (world_rank_ == dest_ids_[0])){ + direct_io(array_idx, current_chunk_id_, read_op, NULL, NULL); + } else { + chunk->init(current_array_, current_chunk_id_, IO_NODE, NO_ALLOC); + bytes_to_go = chunk->total_size_in_bytes(); + chunk->set_data_ptr(mem_buf_); + + /* We don't have 
to make the schema requests - just post the + send/recv */ + tag_ctr = 0; + while (bytes_to_go > 0){ + buf_size = min(SUBCHUNK_SIZE, bytes_to_go); + tag = current_chunk_id_*1000+tag_ctr*10; + if (read_op) { + read_data(mem_buf_, buf_size); + nb_send_message((void *)mem_buf_, buf_size, MPI_CHAR, + dest_ids_[0], + tag+CHUNK_DATA_FROM_IO, MPI_COMM_WORLD, + &requests_[0]); + wait_for_completion(); + } else { + nb_receive_message((void *)mem_buf_, buf_size, MPI_CHAR, + dest_ids_[0],tag+CHUNK_DATA_TO_IO, + MPI_COMM_WORLD, &requests_[0]); + + wait_for_completion(); + write_data(mem_buf_, buf_size, chunk->element_size()); + } + bytes_to_go -= buf_size; + tag_ctr++; + } + chunk->set_data_ptr(NULL); + } + + current_chunk_id_ = current_array_->get_next_index(current_chunk_id_, + my_io_rank_, + num_io_nodes_); + } + } /* End if contiguous_ */ + else { /* Have to use MPI-derived datatypes */ + + while(current_chunk_id_ < num_of_chunks_){ + chunk->init(current_array_, current_chunk_id_, IO_NODE, NO_ALLOC); + if (!sub_chunked_ && (make_subchunks == -1)){ + current_array_->make_sub_chunks(chunk); + make_subchunks = 1; + } + num_of_subchunks_=current_array_->layout(SUB_CHUNK)->total_elements(); + tag_ctr=0; + + for(current_subchunk_id_ = 0; current_subchunk_id_ < num_of_subchunks_; + current_subchunk_id_++){ + subchunk->init(current_chunk_, current_subchunk_id_, NO_ALLOC); + bytes_to_go = subchunk->total_size_in_bytes(); + + if (bytes_to_go > mem_buf_size_){ + realloc_mem_bufs(bytes_to_go); + } + subchunk->set_data_ptr(mem_buf_); + + compute_chunk_overlaps(current_array_, subchunk); + compute_schemas(current_array_, subchunk, compute_chunk, array_idx); + + tag = current_chunk_id_ * 1000 + tag_ctr*10; + if (read_op){ + read_data(subchunk); + send_data_to_compute_nodes(subchunk, tag); + wait_for_completion(); + } else { + receive_data_from_compute_nodes(subchunk, tag); + wait_for_completion(); + write_data(subchunk); + } + tag_ctr++; + subchunk->set_data_ptr(NULL); + } + 
current_chunk_id_ = current_array_->get_next_index(current_chunk_id_, + my_io_rank_, + num_io_nodes_); + } + } + if (part_time_io_) + while (!process_compute_side_array(compute_group, array_idx, read_op)){}; + } + + /* Free the temp chunk objects */ + delete(chunk); + delete(subchunk); + delete(compute_chunk); + chunk = current_chunk_ = subchunk = compute_chunk = NULL; + if (comp_current_array_){ + delete(comp_current_array_); + comp_current_array_ = NULL; + } +} + + +/* This constructor is for pure io_nodes only */ +CSDIO::CSDIO(int *schema_string, int schema_size, int world_rank, + int comp_app_num, int comp_app_size, App_Info *app_info): + Simple_IO(schema_string, schema_size, world_rank, comp_app_num, + comp_app_size, app_info) +{ + clear(); +} + + +/* This call is for compute nodes only */ +CSDIO::CSDIO() +{ + do_init(); +} + +CSDIO::CSDIO(int *schema_string, int schema_size, int world_rank, + int comp_app_num, int comp_app_size, App_Info *app_info, Boolean part_time): + Simple_IO(schema_string, schema_size, world_rank, comp_app_num, + comp_app_size, app_info) +{ + if (part_time){ + /* This is a part-time i/o node */ + do_init(); + part_time_io_ = part_time; + } else { + clear(); + } +} + +void CSDIO::clear() +{ + comp_datatypes_ = NULL; + comp_requests_ = NULL; + comp_statuses_ = NULL; + io_overlap_chunk_ids_ = io_dest_ids_ = comp_overlap_base_ = NULL; + comp_overlap_size_ = comp_overlap_stride_ = NULL; + io_app_info_ = NULL; +} + +void CSDIO::do_init() +{ + max_pending_ = 1; + compute_pending_ = 0; + comp_datatypes_ = (MPI_Datatype *)malloc(sizeof(MPI_Datatype)*max_pending_); + comp_requests_ = (MPI_Request *)malloc(sizeof(MPI_Request)*max_pending_); + comp_statuses_ = (MPI_Status *)malloc(sizeof(MPI_Status)*max_pending_); + + io_max_overlaps_ = 1; + io_overlaps_ =0; + io_overlap_chunk_ids_ = (int *) malloc(sizeof(int)*io_max_overlaps_); + io_dest_ids_ = (int *) malloc(sizeof(int)*io_max_overlaps_); + + max_comp_rank_ = 10; + comp_array_rank_ = 0; + 
comp_overlap_base_ = (int *) malloc(sizeof(int)*max_comp_rank_); + comp_overlap_size_ = (int *) malloc(sizeof(int)*max_comp_rank_); + comp_overlap_stride_ = (int *) malloc(sizeof(int)*max_comp_rank_); +} + + + + +CSDIO::~CSDIO() +{ + if (part_time_io_ || dummy_){ + if (comp_datatypes_) free(comp_datatypes_); + if (comp_requests_) free(comp_requests_); + if (comp_statuses_) free(comp_statuses_); + if (comp_overlap_base_) free(comp_overlap_base_); + if (comp_overlap_size_) free(comp_overlap_size_); + if (comp_overlap_stride_) free(comp_overlap_stride_); + if (io_overlap_chunk_ids_) free(io_overlap_chunk_ids_); + if (io_dest_ids_) free(io_dest_ids_); + if (comp_current_array_) delete(comp_current_array_); + if (io_app_info_) delete(io_app_info_); + }; + clear(); +} + +void CSDIO::receive_io_app_info() +{ + int node_type = MPIFS_global_obj->node_type(); + int num_of_world_nodes, app_info_buf_size, *app_info_buf; + int tag = APP_INFO * 10 + SPECIAL; + App_Info *tmp_info = NULL; + MPI_Status app_status; + + MPI_Comm_size(MPI_COMM_WORLD, &num_of_world_nodes); + app_info_buf_size = num_of_world_nodes+2; /* Num io nodes <= total nodes */ + app_info_buf = (int *)malloc(sizeof(int)*app_info_buf_size); + + if (node_type == IO_NODE){ + /* Master i/o node sends io app info to the master compute node */ + if (MPIFS_global_obj->am_master_io_node()){ + tmp_info = MPIFS_global_obj->io_app_info(); + app_info_buf[0] = tmp_info->app_num(); + app_info_buf[1] = tmp_info->app_size(); + tmp_info->world_ranks(&app_info_buf[2]); + app_info_buf_size = app_info_buf[1] + 2; +#ifdef DEBUG +printf("%d:app_num=%d app_size=%d\n", world_rank_, app_info_buf[0], + app_info_buf[1]); +printf("sending messages to %d\n", app_info_->get_master()); +#endif + send_message((void *) app_info_buf, app_info_buf_size, MPI_INT, + app_info_->get_master(), tag, MPI_COMM_WORLD); +#ifdef DEBUG + printf("%d: %d %d %d\n", world_rank_,app_info_buf[0], app_info_buf[1], + app_info_buf[2]); +#endif + } + } else if 
(node_type == PART_TIME_IO){ + if (MPIFS_global_obj->am_master_io_node()){ + tmp_info = MPIFS_global_obj->io_app_info(); + app_info_buf[0] = tmp_info->app_num(); + app_info_buf[1] = tmp_info->app_size(); + tmp_info->world_ranks(&app_info_buf[2]); + app_info_buf_size = app_info_buf[1] + 2; + + if (MPIFS_global_obj->am_master_compute_node()){ + MPIFS_global_obj->Broadcast(COMPUTE_NODE, (void *) app_info_buf, + app_info_buf_size, MPI_INT, tag); + } else { + send_message((void *)app_info_buf, app_info_buf_size, MPI_INT, + app_info_->get_master(), tag, MPI_COMM_WORLD); + receive_message((void *)app_info_buf, app_info_buf_size, MPI_INT, + MPI_ANY_SOURCE, tag, MPI_COMM_WORLD, + &app_status); + mpi_get_count(&app_status, MPI_INT, &app_info_buf_size); + MPIFS_global_obj->Broadcast(COMPUTE_NODE, (void *) app_info_buf, + app_info_buf_size, MPI_INT, tag); + } + } else { + receive_message((void *)app_info_buf, app_info_buf_size, MPI_INT, + MPI_ANY_SOURCE, tag, MPI_COMM_WORLD, + &app_status); + mpi_get_count(&app_status, MPI_INT, &app_info_buf_size); + MPIFS_global_obj->Broadcast(COMPUTE_NODE, (void *) app_info_buf, + app_info_buf_size, MPI_INT, tag); + } + + io_app_info_ = new App_Info(app_info_buf[0], app_info_buf[1], + &app_info_buf[2]); + } else if (node_type == COMPUTE_NODE) { + receive_message((void *)app_info_buf, app_info_buf_size, MPI_INT, + MPI_ANY_SOURCE, tag, MPI_COMM_WORLD, + &app_status); + mpi_get_count(&app_status, MPI_INT, &app_info_buf_size); +#ifdef DEBUG + printf("%d:app_info_buf_size =%d\n", world_rank_, app_info_buf_size); +#endif + io_app_info_ = new App_Info(app_info_buf[0], app_info_buf[1], + &app_info_buf[2]); + MPIFS_global_obj->Broadcast(COMPUTE_NODE, (void *) app_info_buf, + app_info_buf_size, MPI_INT, tag); + + } else if (node_type == PART_TIME_COMPUTE) { + receive_message((void *)app_info_buf, app_info_buf_size, MPI_INT, + MPI_ANY_SOURCE, tag, MPI_COMM_WORLD, + &app_status); + mpi_get_count(&app_status, MPI_INT, &app_info_buf_size); + io_app_info_ 
= new App_Info(app_info_buf[0], app_info_buf[1], + &app_info_buf[2]); + MPIFS_global_obj->Broadcast(COMPUTE_NODE, (void *) app_info_buf, + app_info_buf_size, MPI_INT, tag); + + } else { + printf("Error in CSDIO::receive_io_app_info - incorrect node type\n"); + exit(1); + } + free(app_info_buf); + app_info_buf = NULL; +} + +/* Store the schema only for the part-time i/o case. Don't send the any + * schema message. + */ +void CSDIO::send_schema_message(int array_id, int index) +{ + int *ptr = schema_bufs_[index]; + + if (part_time_io_ && (dest_ids_[index] == world_rank_)){ + *ptr++ = array_id; + *ptr++ = overlap_chunk_ids_[index]; + *ptr++ = (int) nat_chunked_; + *ptr++ = (int) contiguous_; + *ptr++ = array_rank_; + *ptr++ = op_type_; + + for(int i=0; i < array_rank_; i++) *ptr++ = overlap_base_[i]; + for(i=0; i < array_rank_; i++) *ptr++ = overlap_size_[i]; + for(i=0; i < array_rank_; i++) *ptr++ = overlap_stride_[i]; + } + } + + +void CSDIO::send_data_to_compute_nodes(Chunk *subchunk, int tag) +{ + for(int i=0; i < num_overlaps_; i++){ + if (part_time_io_ && (dest_ids_[i] == world_rank_)){ + copy_data(subchunk, i, YES, NULL, NULL); + requests_[i] = MPI_REQUEST_NULL; + } else { + nb_send_message((void *)data_ptrs_[i], 1, datatypes_[i], + dest_ids_[i], tag+CHUNK_DATA_FROM_IO,MPI_COMM_WORLD, + &requests_[i]); + } + } +} + +void CSDIO::receive_data_from_compute_nodes(Chunk *subchunk, int tag) +{ + for(int i=0; i < num_overlaps_; i++){ + if (part_time_io_ && (dest_ids_[i] == world_rank_)){ + copy_data(subchunk, i, NO, NULL, NULL); + requests_[i] = MPI_REQUEST_NULL; + } else { + nb_receive_message((void *)data_ptrs_[i], 1, datatypes_[i], + dest_ids_[i], tag+CHUNK_DATA_TO_IO,MPI_COMM_WORLD, + &requests_[i]); + } + } +} + + + +void CSDIO::realloc_compute_schema_bufs(int new_max) +{ + max_comp_rank_ = new_max; + comp_overlap_base_ = (int *) realloc(comp_overlap_base_, new_max*sizeof(int)); + comp_overlap_stride_ = (int *) realloc(comp_overlap_stride_, new_max*sizeof(int)); 
+ comp_overlap_size_ = (int *) realloc(comp_overlap_size_, new_max*sizeof(int)); +} + +void CSDIO::realloc_pending_messages(int new_max) +{ + max_pending_ = new_max; + comp_datatypes_ =(MPI_Datatype *)realloc(comp_datatypes_,new_max*sizeof(MPI_Datatype)); + comp_requests_ = (MPI_Request *)realloc(comp_requests_, new_max*sizeof(MPI_Request)); + comp_statuses_ = (MPI_Status*)realloc(comp_statuses_, new_max*sizeof(MPI_Status)); +} + + +void CSDIO::realloc_io_buffers(int new_max) +{ + io_max_overlaps_ = new_max; + io_overlap_chunk_ids_ =(int*)realloc(io_overlap_chunk_ids_, new_max*sizeof(int)); + io_dest_ids_ = (int *) realloc(io_dest_ids_, new_max*sizeof(int)); +} + +void CSDIO::io_chunk_overlaps(Array *array, Chunk *subchunk) +{ + int num_compute_chunks; + + if (nat_chunked_){ + io_overlaps_ = 1; + io_overlap_chunk_ids_[0] = current_chunk_id_; + } + else{ + + num_compute_chunks = array->layout(IO_NODE)->total_elements(); + if (num_compute_chunks > io_max_overlaps_) realloc_io_buffers(num_compute_chunks); + subchunk->chunk_overlaps(array, &io_overlaps_, + io_overlap_chunk_ids_, IO_NODE); + } + + for(int i=0; i < io_overlaps_;i++) + io_dest_ids_[i] = io_app_info_->world_rank + (array->which_node(io_overlap_chunk_ids_[i], + IO_NODE, num_io_nodes_)); +} + + +void CSDIO::wait_for_completion() +{ + MPI_Waitall(num_overlaps_, requests_, statuses_); + if (!contiguous_) + for(int i=0; i< num_overlaps_;i++) + MPI_Type_free(&datatypes_[i]); +} + +char* CSDIO::name() +{ + return name_; +} + diff --git a/src/Panda/CSDIO.h b/src/Panda/CSDIO.h new file mode 100644 index 0000000..efd1841 --- /dev/null +++ b/src/Panda/CSDIO.h @@ -0,0 +1,60 @@ +#ifndef CSDIO_dot_h +#define CSDIO_dot_h + +#include "Simple_IO.h" +class ArrayGroup; +class Array; +class App_Info; +class Chunk; + +class CSDIO : public Simple_IO +{ + protected: + int compute_pending_; + int max_pending_; + MPI_Datatype *comp_datatypes_; + MPI_Request *comp_requests_; + MPI_Status *comp_statuses_; + int comp_array_rank_; + 
int max_comp_rank_; + int *comp_overlap_base_; + int *comp_overlap_size_; + int *comp_overlap_stride_; + int io_max_overlaps_; + int io_overlaps_; + int *io_overlap_chunk_ids_; + int *io_dest_ids_; + Array *comp_current_array_; + int comp_current_array_id_; + int comp_current_chunk_id_; + int comp_current_subchunk_id_; + int comp_num_of_subchunks_; + App_Info *io_app_info_; + + Boolean process_compute_side_array(ArrayGroup*,int,Boolean); + void clear(); + void do_init(); + void receive_io_app_info(); + virtual void send_schema_message(int,int); + virtual void send_data_to_compute_nodes(Chunk*,int); + virtual void receive_data_from_compute_nodes(Chunk*,int); + void realloc_compute_schema_bufs(int); + void realloc_pending_messages(int); + void realloc_io_buffers(int); + void io_chunk_overlaps(Array*,Chunk*); + void wait_for_completion(); + + + public: + CSDIO(int*,int,int,int,int,App_Info*); + CSDIO(int*,int,int,int,int,App_Info*,Boolean); + CSDIO(); + virtual ~CSDIO(); + virtual void start_to_finish(Boolean, ArrayGroup*); + virtual void compute_node_io_loop(ArrayGroup*); + virtual char* name(); +}; + + + +#endif diff --git a/src/Panda/CSDIO_Shared.C b/src/Panda/CSDIO_Shared.C new file mode 100644 index 0000000..35e864d --- /dev/null +++ b/src/Panda/CSDIO_Shared.C @@ -0,0 +1,241 @@ +#include "definitions.h" +#include "ArrayGroup.h" +#include "MPIFS.h" +#include "Chunk.h" +#include "App_Info.h" +#include "Array.h" +#include "message.h" +#include "CSDIO_Shared.h" + +/* we could have made this class multiply inherit from CSDIO and CSDIO_Shared, but + * we would have to use virtual inheritance and depending on the compiler used, + * there could be a performance penalty (though it would still be dwarfed by the + * cost of message-passing and disk i/o) + */ + +extern MPIFS* MPIFS_global_obj; +extern int SUBCHUNK_SIZE; + +CSDIO_Shared::CSDIO_Shared(int *schema_string, int schema_size, int world_rank, + int comp_app_num,int comp_app_size , App_Info *app_info) +: 
CSDIO(schema_string, schema_size, world_rank, comp_app_num, + comp_app_size, app_info) +{ + + compute_chunk_ = new Chunk(); + current_chunk_ = new Chunk(); + subchunk_ = new Chunk(); + current_array_id_ = -1; + if ((op_type_ == RESTART)||(op_type_ == GENERAL_READ)|| + (op_type_ == READ_TIMESTEP)) + read_op_ = YES; + else + read_op_ = NO; + + /* Send the IO app info to the compute nodes */ + receive_io_app_info(); + + /* We need to set the following variables so that continue_io()* + * would start the I/O of the first subchunk automatically */ + contiguous_ = NO; + current_array_id_ = -1; + current_chunk_id_ = 0; + num_of_chunks_ = -1; /* This will cause get_next_chunk() to fail */ + current_subchunk_id_ = 0; + num_of_subchunks_ = -1; /* Causes get_next_subchunk() to fail */ + status_flag_ = START; + continue_io(); +} + +CSDIO_Shared::~CSDIO_Shared() +{ + if (subchunk_) delete subchunk_; + if (compute_chunk_) delete compute_chunk_; + subchunk_ = compute_chunk_ = NULL; +} + +Boolean CSDIO_Shared::get_next_array(){ + current_array_id_++; + if (current_array_id_ < num_of_arrays_){ + make_subchunks_ = -1; + current_array_ = find_array(current_array_id_); + nat_chunked_ = current_array_->nat_chunked(); + sub_chunked_ = current_array_->sub_chunked(); + array_rank_ = current_array_->rank(); + + if (array_rank_ > max_rank_){ + realloc_schema_bufs(array_rank_); + } + num_of_chunks_ = current_array_->layout(IO_NODE)->total_elements(); + current_chunk_id_ = -1; + if (nat_chunked_ && !sub_chunked_) + contiguous_ = YES; /* No need to use derived datatypes */ + else + contiguous_ = NO; /* Have to use derived datatypes */ + + bytes_to_go_ = 0; + current_subchunk_id_ = -1; + return YES; + } else + return NO; +} + + +Boolean CSDIO_Shared::get_next_chunk() +{ + int *ptr; + + if (!current_array_) return NO; + current_chunk_id_ = current_array_->get_next_index(current_chunk_id_, + my_io_rank_, + num_io_nodes_); + if (current_chunk_id_ < num_of_chunks_){ + 
current_chunk_->set_data_ptr(NULL); + current_chunk_->init(current_array_, current_chunk_id_, + IO_NODE, NO_ALLOC); + tag_ = current_chunk_id_*1000; + if (contiguous_){ + bytes_to_go_ = current_chunk_->total_size_in_bytes(); + current_chunk_->set_data_ptr(mem_buf_); + ptr = schema_bufs_[0]; + *ptr++ = current_array_id_; + *ptr++ = current_chunk_id_; + *ptr++ = (int) nat_chunked_; + *ptr++ = (int) contiguous_; + *ptr++ = op_type_; + *ptr++ = 0; + *ptr++ = 0; + compute_chunk_overlaps(current_array_, current_chunk_); + } + else { + if (!sub_chunked_ && (make_subchunks_ == -1)){ + current_array_->make_sub_chunks(current_chunk_); + make_subchunks_ = 1; + } + num_of_subchunks_ = current_array_->layout(SUB_CHUNK)->total_elements(); + current_subchunk_id_ = -1; + } + return YES; + } + else + return NO; +} + + +/* This should not be called for the contiguous_ case */ +Boolean CSDIO_Shared::get_next_subchunk() +{ + current_subchunk_id_++; + if (current_subchunk_id_ < num_of_subchunks_){ + subchunk_->set_data_ptr(NULL); + subchunk_->init(current_chunk_, current_subchunk_id_, NO_ALLOC); + bytes_to_go_ = subchunk_->total_size_in_bytes(); + + if (bytes_to_go_ < mem_buf_size_) + realloc_mem_bufs(bytes_to_go_); + + subchunk_->set_data_ptr(mem_buf_); + return YES; + } + else + return NO; +} + + +void CSDIO_Shared::start_subchunk_io() +{ + int *ptr; + + if (contiguous_){ + ptr = schema_bufs_[0]; + ptr[6] = min(SUBCHUNK_SIZE, bytes_to_go_); + if (read_op_) read_data(mem_buf_, ptr[6]); + if (read_op_) + nb_send_message((void *)mem_buf_, ptr[6], MPI_CHAR, dest_ids_[0], + tag_+CHUNK_DATA_FROM_IO, MPI_COMM_WORLD, &requests_[0]); + else + nb_receive_message((void *)mem_buf_, ptr[6], MPI_CHAR, dest_ids_[0], + tag_+CHUNK_DATA_TO_IO, MPI_COMM_WORLD, &requests_[0]); + ptr[5] += ptr[6]; /* Offset of the next subchunk */ + bytes_to_go_ -= ptr[6]; + status_flag_ = WAITING; + tag_ += 10; + + } else { + compute_chunk_overlaps(current_array_, subchunk_); + + compute_schemas(current_array_, 
subchunk_, compute_chunk_, current_array_id_); + if (read_op_){ + read_data(subchunk_); + send_data_to_compute_nodes(subchunk_, tag_); + } + else + receive_data_from_compute_nodes(subchunk_, tag_); + status_flag_ = WAITING; + tag_ += 10; + } +} + + +Boolean CSDIO_Shared::test_subchunk_io() +{ + int flag; + MPI_Testall(num_overlaps_, requests_, &flag, statuses_); + if (flag) { + status_flag_ = START; + if (!read_op_) + if (contiguous_) + write_data(mem_buf_, schema_bufs_[0][6], 1); + else + write_data(subchunk_); + + if (!contiguous_) free_datatypes(); + return YES; + } + return NO; +} + + +/* Return YES, if I/O is complete */ +Boolean CSDIO_Shared::continue_io() +{ + if (status_flag_ == START){ + if (!start_next_subchunk_io()) return YES; /* IO completed */ + } else if (status_flag_ == WAITING){ + if (test_subchunk_io()) + if (!start_next_subchunk_io()) return YES; /* IO done */ + } else { + printf("Error - Invalid status_flag value \n"); + exit(11); + } + return NO; +} + +/* Return yes if you can start the io of another subchunk */ +Boolean CSDIO_Shared::start_next_subchunk_io() +{ + if (contiguous_){ + if (bytes_to_go_ <= 0){ + while(!get_next_chunk()){ + if (!get_next_array()) return NO; + } + /* Since we might be looking at another array */ + if (!contiguous_) get_next_subchunk(); + } + + start_subchunk_io(); + } else { + + if (!get_next_subchunk()){ + /* We have finished this chunk */ + while(!get_next_chunk()){ + if (!get_next_array()) return NO; + } + if (!contiguous_) get_next_subchunk(); + } + + start_subchunk_io(); + } + return YES; +} + diff --git a/src/Panda/CSDIO_Shared.h b/src/Panda/CSDIO_Shared.h new file mode 100644 index 0000000..08e9fd8 --- /dev/null +++ b/src/Panda/CSDIO_Shared.h @@ -0,0 +1,33 @@ +#ifndef CSDIO_Shared_dot_h +#define CSDIO_Shared_dot_h + +#include "CSDIO.h" +class Chunk; + +class CSDIO_Shared : public CSDIO +{ + protected: + int current_array_id_; + int status_flag_; + Chunk *subchunk_; + Chunk *compute_chunk_; + Boolean read_op_; 
+ int bytes_to_go_; + int make_subchunks_; + int tag_; + + Boolean get_next_chunk(); + Boolean get_next_array(); + Boolean get_next_subchunk(); + Boolean start_next_subchunk_io(); + void start_subchunk_io(); + Boolean test_subchunk_io(); + + public: + CSDIO_Shared(int*,int,int,int,int, App_Info*); + virtual ~CSDIO_Shared(); + virtual Boolean continue_io(); +}; + +#endif + diff --git a/src/Panda/Chunk.C b/src/Panda/Chunk.C new file mode 100644 index 0000000..d6fd028 --- /dev/null +++ b/src/Panda/Chunk.C @@ -0,0 +1,692 @@ +#include "definitions.h" +#include "Chunk.h" +#include "Array.h" +#include <malloc.h> + + +Chunk::Chunk() +{ + base_ = stride_ = size_ = NULL; + array_ = NULL; + chunk_ = NULL; + data_ptr_ = NULL; + stencil_width_ = 0; +} + + +/* This constructor is used to create a chunk given array information */ +Chunk::Chunk(Array *array, int chunk_id, int node_type, DataStatus data_status) +{ + do_init(array, chunk_id, node_type, data_status); +} + +/* Re-initialize an already created chunk object */ +void Chunk::init(Array *array, int chunk_id, int node_type, DataStatus data_status) +{ + clear(); + do_init(array, chunk_id, node_type, data_status); +} + +void Chunk::do_init(Array *array, int chunk_id, int node_type, + DataStatus data_status) +{ + int *stride, *base; + + /* Initialize the instance variables */ + array_ = array; + chunk_ = NULL; + chunk_id_ = chunk_id; + am_subchunk_ = NO; + element_size_ = array->element_size(); + + stride = (int *) malloc(sizeof(int)*array->rank()); + base = (int *) malloc(sizeof(int)*array->rank()); + for(int i=0; i < array->rank(); i++){ stride[i] = 1; base[i] = 0; } + + RegularDistribution *layout=(RegularDistribution *)(array->layout(node_type)); + calculate_base_size_stride(array->rank(), base, array->size(), stride, + layout->layout(), layout->distribution(), + layout->block_dist(), chunk_id); + + /* check if we have to allocate the data space */ + switch(data_status) { + case ALLOC: + data_ptr_ = (char 
*)malloc(total_size_in_bytes()); + data_status_ = data_status; + stencil_width_ = 0; + break; + + case NO_ALLOC: + data_ptr_ = NULL; + data_status_ = data_status; + stencil_width_ = 0; + break; + + default: + printf("Unsupported \n"); + break; + } +} + +/* This creates a subchunk , given the chunk and subchunk_id */ +Chunk::Chunk(Chunk* mega_chunk, int sub_chunkid, DataStatus data_status) +{ + do_init(mega_chunk, sub_chunkid, data_status); +} + +/* Re-initialize an already created subchunk obj */ +void Chunk::init(Chunk* mega_chunk, int sub_chunkid, DataStatus data_status) +{ + clear(); + do_init(mega_chunk, sub_chunkid, data_status); +} + + +void Chunk::do_init(Chunk* mega_chunk, int sub_chunkid, DataStatus data_status) +{ + chunk_id_ = sub_chunkid; + element_size_ = mega_chunk->element_size(); + array_ = mega_chunk->array(); + chunk_ = mega_chunk; + am_subchunk_ = YES; + + RegularDistribution *layout=(RegularDistribution *)(array_->layout(SUB_CHUNK)); + calculate_base_size_stride(mega_chunk->rank(), mega_chunk->base(), + mega_chunk->size(), mega_chunk->stride(), + layout->layout(), layout->distribution(), + layout->block_dist(), sub_chunkid); + /* check if we have to allocate the data space */ + switch(data_status) { + case ALLOC: + data_ptr_ = (char *)malloc(total_size_in_bytes()); + data_status_ = data_status; + stencil_width_ = 0; + break; + + case NO_ALLOC: + data_ptr_ = NULL; + data_status_ = data_status; + stencil_width_ = 0; + break; + + default: + data_ptr_ = NULL; + printf("Unsupported \n"); + break; + } +} + +Chunk::~Chunk() +{ + if (base_) delete base_; + if (stride_) delete stride_; + + /* Delete the data buffer only if we allocated it in the first place */ + if ((data_status_ == ALLOC) && data_ptr_) delete data_ptr_; +} + + +void Chunk::clear() +{ + if (base_) free (base_); + if (stride_) free (stride_); + if (data_ptr_) free( data_ptr_); + if (size_) free(size_); + base_ = size_ = stride_ = NULL; + data_ptr_ = NULL; +} + +/* This function takes as 
input the information about the global + * Array and returns the overlapping compute node chunk indices + * via a singly linked list. + * + * Currently this function can only handle BLOCK,* arrays (Needs + * to be extended for the CYCLIC case) + */ +void Chunk::chunk_overlaps(Array *global_array, int* num_overlaps, + int *ret_list, int node_type) +{ + RegularDistribution *layout1 = + (RegularDistribution *)global_array->layout(node_type); + ArrayLayout *layout= layout1->layout(); + int layout_rank = layout->rank(); + int *overlap_base = (int *)malloc(sizeof(int)*layout_rank); + int *overlap_size = (int *)malloc(sizeof(int)*layout_rank); + + /* Find out the list of possible overlaps */ + compute_first_last_chunk(global_array->rank(), global_array->size(), + layout, layout1->distribution(), layout1->block_dist(), + overlap_base, overlap_size); +#ifdef DEBUG + printf("In chunk_overlaps\n"); + for(int i=0;i<layout_rank;i++) + printf("base[%d] = %d size[%d] = %d\n", i, overlap_base[i], i, overlap_size[i]); +#endif + layout->indices_list(overlap_base, overlap_size, num_overlaps, ret_list); + free(overlap_base); + free(overlap_size); +} + + +/* This function isn't general enough. It implicitly assumes that the I/O + * chunks are distributed using only BLOCK.* distributions. 
Also the + * compute node chunks are assumed to be distributed using only + * BLOCK,* (can be extended to support CYLCIC later) + * + * Function assumes that the memory for the return paramters + * overlap_base and overlap_size have been allocated + */ +void Chunk::compute_first_last_chunk(int array_rank, int *array_size, + ArrayLayout *layout, Distribution *dist, + Block_Distribution block_dist, + int *overlap_base, int *overlap_size) +{ + /* Validation of input data */ + if (!(layout->valid_distribution(array_rank, dist))) + { + printf("Invalid distribution in compute_first_last_chunk\n"); + exit(1); + } + + /* Verify to see if we are dealing with BLOCK,* case only */ + for(int i=0;i<layout->rank();i++) + { + if (dist[i] == CYCLIC) + { + printf("Cyclic schema not yet supported\n"); + exit(2); + } + } + + for(i=0; i<array_rank;i++) + { + if (stride_[i] != 1) + { + printf("Cyclic schema not yet supported\n"); + exit(2); + } + } + + + /* Now we can get down to business */ + int *overlap_last = (int*)malloc(sizeof(int)*layout->rank()); + int layout_idx=0, array_idx; + int def_chunk_size,rem,tmp,last; + + for(array_idx=0;array_idx < array_rank; array_idx++) + { + switch(dist[array_idx]) + { + case NONE: + break; + + case CYCLIC: + printf("Cyclic schema not yet supported\n"); + exit(3); + break; + + /* Need to verify this stuff - especially the NAS stuff */ + case BLOCK: + switch(block_dist) + { + case HPF: + def_chunk_size = (array_size[array_idx]+layout->size(layout_idx)-1) + / (layout->size(layout_idx)); + overlap_base[layout_idx] = base_[array_idx] + / def_chunk_size; + overlap_last[layout_idx] = (base_[array_idx]+size_[array_idx] -1) + / def_chunk_size; + break; + + case NAS: + def_chunk_size = array_size[array_idx] + / layout->size(layout_idx); + rem = array_size[array_idx] + % layout->size(layout_idx); + if (rem == 0) + { + /* perfect distribution */ + overlap_base[layout_idx] = base_[array_idx] + / def_chunk_size; + overlap_last[layout_idx] = (base_[array_idx] 
+ + size_[array_idx] -1) + / def_chunk_size; + } + else + { + /* first "rem" blocks have "def_chunk+1" elements */ + tmp = (def_chunk_size+1)*rem; + if (base_[array_idx] < tmp) + { + overlap_base[layout_idx] = base_[array_idx] + / (def_chunk_size + 1); + } + else + { + overlap_base[layout_idx] = ((base_[array_idx] - tmp) + / def_chunk_size) + rem; + } + + last = base_[array_idx] + size_[array_idx] -1; + if (last < tmp) + { + overlap_last[layout_idx] = last / (def_chunk_size+1); + + } + else + { + overlap_last[layout_idx] = ((last - tmp) + / def_chunk_size) + rem; + } + } + break; + + default: + printf("Unsupported block distribution\n"); + exit(2); + break; + } + overlap_size[layout_idx] = overlap_last[layout_idx] + - overlap_base[layout_idx] + 1; + layout_idx++; + break; + + default: + printf("Unsupported distribution\n"); + exit(3); + break; + } + + } + + free(overlap_last); + return; +} + + + + +int Chunk::total_size_in_bytes() +{ + return (total_size_in_elements()*element_size_); +} + + + +int Chunk::total_size_in_elements() +{ + return total_elements(); +} + + +int Chunk::chunk_id(){return chunk_id_;} + + +void * Chunk::data_ptr(){return data_ptr_;} + + + +/* This is not a method. It is an generalized inline function to + * calculate the overlap between two chunks. The input parameters + * are rank,base,stride,size of the two arrays and the pointers to + * the base,strides and sizes of the resultant chunk. The functions + * assumes that the rank of the input arrays are equal + * + * This function also assumes that the memory for the return values + * r_base, r_stride, rsize have already been allocated. 
+ */ +inline void determine_overlap(int rank, int *c1_base, int* c1_size, + int* c1_stride, + int* c2_base, int* c2_size, int* c2_stride, + int* r_base, int* r_size, int* r_stride) +{ + + int tmp_base,tmp_size,n; + + for(int i=0; i< rank;i++) + { + /* Compute overlap in each dimension */ + if ((c1_stride[i] == 1) && (c2_stride[i] == 1)) + { + /* Simplest case + * r_base = max(c1_base, c2_base) + * r_size = max( min(c1_base+c1_size, c2_base+c2_size)-r_base, 0); + */ + r_base[i] = max(c1_base[i], c2_base[i]); + r_size[i] = max((min(c1_base[i]+c1_size[i], c2_base[i]+c2_size[i]) + - r_base[i]), 0); + r_stride[i] = 1; + } + else if (c1_stride[i] == 1) + { + /* Not so simple - this needs to be verified + * tmp_B = max(c1_base,c2_base) + * B = tmp_B + (N - ((tmp_B - c2_base)%N))%N + * U = min(c1_base+(c1_size-1), c2_base+(c2_size-1)*N) - B + * if (U < 0) the no overlap else r_size = U/N + 1 + */ + n = c2_stride[i]; + tmp_base = max(c1_base[i], c2_base[i]); + r_base[i] = tmp_base + (n -((tmp_base - c2_base[i])%n))%n; + tmp_size = min(c1_base[i]+(c1_size[i]-1), c2_base[i]+(c2_size[i]-1)*n); + if (tmp_size < 0) + { + /* no overlap */ + r_size[i] = 0; + r_stride[i] = 1; + } + else + { + r_size[i] = tmp_size / n + 1; + r_stride[i] = n; + } + } + else if (c2_stride[i] == 1) + { + /* Similar to the previous case */ + n = c1_stride[i]; + tmp_base = max(c1_base[i], c2_base[i]); + r_base[i] = tmp_base + (n -((tmp_base - c1_base[i])%n))%n; + tmp_size = min(c1_base[i]+(c1_size[i]-1)*n, c2_base[i]+(c2_size[i]-1)); + if (tmp_size < 0) + { + /* no overlap */ + r_size[i] = 0; + r_stride[i] = 1; + } + else + { + r_size[i] = tmp_size / n + 1; + r_stride[i] = n; + } + } + else if (c1_stride[i] = c2_stride[i]) + { + /* Can do this one later */ + } + else + { + /* I give up */ + } + } +#ifdef DEBUG + /* Debugging output */ + printf ("In determine overlap rank= %d\n", rank); + int k; + for(k=0;k<rank;k++) + printf("%d %d %d %d %d %d %d %d %d\n", c1_base[k], c1_size[k], c1_stride[k], + 
c2_base[k], c2_size[k], c2_stride[k], + r_base[k], r_size[k], r_stride[k]); +#endif + return; +} + + +void Chunk::compute_overlap(Chunk *compute_chunk, int *overlap_base, + int *overlap_size, int *overlap_stride) +{ + determine_overlap(rank_, base_, size_, stride_, + compute_chunk->base(), + compute_chunk->size(), + compute_chunk->stride(), + overlap_base, + overlap_size, + overlap_stride); +} + + +int* Chunk::base(){return base_;} +int* Chunk::size(){return size_;} +int* Chunk::stride(){return stride_;} + +int Chunk::element_size() { return element_size_; } +/* This function needs to be verified when the stride is not 1 */ +void Chunk::base_offset(int *base, void **ptr) +{ + int base_offset = 0; + int offset=1; + + for(int i=rank_ - 1; i>= 0; i--) + { + base_offset += ((base[i]-base_[i]) / stride_[i])*offset; + offset *= size_[i]; + } + base_offset *= element_size_; + *ptr = (char *)data_ptr_ + base_offset; +} + +void Chunk::convert_from_number_to_index(int num, int *result) +{ + int i,j, product=1; + + for(i=0;i<rank_;i++) + { + product=1; + for(j=i+1; j< rank_;j++) product *= size_[j]; + result[i] = num / product; + num -= num/product * product; + } +} + + +/* This method calculates the rank, base, stride of the chunk * + * (subchunk), given the dimensions of the array (chunk) and its * + * layout, distribution and the chunk (subchunk index) */ +void Chunk::calculate_base_size_stride(int rank, int* old_base, + int* old_size, int* old_stride, + ArrayLayout *layout, Distribution *dist, + Block_Distribution block_dist, int id) +{ + int *chunk_index=NULL; + int idx=0, layout_idx=0; + int default_size, rem; + + chunk_index = layout->convert_from_number_to_index(id); + rank_ = rank; + size_ = (int *) malloc(sizeof(int)*rank); + base_ = (int *) malloc(sizeof(int)*rank); + stride_ = (int *) malloc(sizeof(int)*rank); + + + /* Verify if it is possible to distribute the array (subchunk) */ + if (!(layout->valid_index(chunk_index))) + { + printf("Invalid chunk index %d in 
compute_base_size_stride\n", id); + exit(1); + } + if (!(layout->valid_distribution(rank, dist))) + { + printf("Unable to distribute array in compute_base_size_stride\n"); + exit(2); + } + + for(idx=0; idx < rank; idx++) + { + switch(dist[idx]) + { + case NONE: + base_[idx] = old_base[idx]; + size_[idx] = old_size[idx]; + stride_[idx] = old_stride[idx]*1; + break; + + case CYCLIC: + base_[idx] = old_base[idx] + chunk_index[layout_idx]*old_stride[idx]; + size_[idx] = (old_size[idx] - chunk_index[layout_idx] + + layout->size(layout_idx)-1)/ layout->size(layout_idx); + stride_[idx] = layout->size(layout_idx) * old_stride[idx]; + layout_idx++; + break; + + case BLOCK: + switch(block_dist) + { + case HPF: + default_size = (old_size[idx] + layout->size(layout_idx)-1) + /layout->size(layout_idx); + base_[idx] = old_base[idx] + default_size * + chunk_index[layout_idx] *old_stride[idx]; + size_[idx] = default_size; + stride_[idx] = old_stride[idx]*1; + /* The last chunk may be smaller */ + if (chunk_index[layout_idx] ==(layout->size(layout_idx)-1)) + { + size_[idx] = old_size[idx] - + (default_size * chunk_index[layout_idx]); + } + break; + + case NAS: + default_size = old_size[idx] / layout->size(layout_idx); + rem = old_size[idx] % layout->size(layout_idx); + if (chunk_index[layout_idx] < rem) + { + base_[idx] = old_base[idx] + (chunk_index[layout_idx] + + chunk_index[layout_idx]*default_size) + *old_stride[idx]; + size_[idx] = default_size + 1; + } + else + { + base_[idx] = old_base[idx] + (rem + + chunk_index[layout_idx]*default_size) + *old_stride[idx]; + size_[idx] = default_size; + } + stride_[idx] = old_stride[idx] * 1; + break; + + + default: + printf("Unsupported Block Distribution specified\n"); + exit(3); + break; + } + layout_idx++; + break; + + default: + printf("Unsupported Distribution specified\n"); + exit(3); + break; + } + } + + free(chunk_index); + return; +} + +Array* Chunk::array(){return array_;} + +Boolean Chunk::am_subchunk(){return am_subchunk_;} + 
+void Chunk::copy_base_size_stride(int *base, int *size, int *stride) +{ + for(int i=0; i< rank_; i++){ + base[i] = base_[i]; + size[i] = size_[i]; + stride[i] = stride_[i]; + } + } + + + +/* This assumes that all the strides are 1 - i.e no cyclic */ +void Chunk::make_datatype(int *overlap_base, int *overlap_size, + int *overlap_stride, void **ptr, + MPI_Datatype *return_data_type) +{ + + MPI_Datatype *tmp_types = (MPI_Datatype *) malloc(sizeof(MPI_Datatype) * rank_); + int i,j , offset = 1; + int base_offset = 0; + int *size, *base; + Boolean allocate; + + // If there is a ghost region + int *array_size = array_->size(); + int bound; + if (stencil_width_ > 0) { + size = (int *)malloc(sizeof(int) * rank_); + base = (int *)malloc(sizeof(int) * rank_); + for (i=0; i<rank_; i++) { + bound = base_[i] + size_[i]; + base[i] = max(base_[i] - stencil_width_, 0); + bound = min(bound + stencil_width_, array_size[i]); + size[i] = bound - base[i]; + } + allocate = YES; + //printf("##### stencil %d base %d %d %d size %d %d %d\n", stencil_width_, base[0], base[1], base[2], size[0], size[1], size[2]); + } else { + size = size_; + base = base_; + allocate = NO; + } + + MPI_Type_contiguous(element_size_, MPI_CHAR, &tmp_types[rank_-1]); + if (overlap_stride[rank_ -1] != 1) + { + printf("error - stride is %d", overlap_stride[rank_ -1]); + exit(10); + } + MPI_Type_vector(overlap_size[rank_-1], 1, 1, tmp_types[rank_-1], &tmp_types[rank_-2]); + for(i=rank_-1; i > 0; i--) + { + offset=1; + for(j=i;j <rank_; j++) offset *= size[j]; + if (overlap_stride[i-1] != 1) + { + printf("error - stride is %d\n", overlap_stride[i-1]); + exit(10); + } + if (i != 1){ + + MPI_Type_hvector(overlap_size[i-1],1,offset*element_size_, + tmp_types[i-1], + &tmp_types[i-2]); + } + else + MPI_Type_hvector(overlap_size[i-1],1,offset*element_size_, + tmp_types[i-1], + return_data_type); + } + MPI_Type_commit(return_data_type); + offset=1; + for(i=rank_-1;i >= 0; i--) + { + base_offset += (overlap_base[i] - 
base[i])*offset; + offset *= size[i]; + } + + *ptr = data_ptr_ + base_offset*element_size_; + free (tmp_types); + if (allocate) { + free(size); + free(base); + } +} + + +/* Old data buffer should be freed by someother function */ +void Chunk::set_data_ptr(char *data_ptr){ + data_ptr_ = data_ptr; +} + +void Chunk::set_stencil_width(int stencil_width){ + stencil_width_ = stencil_width; +} + +Chunk::Chunk(Array *array, int *base, int *size) +{ + array_ = array; + rank_ = array->rank(); + element_size_ = array->element_size(); + chunk_id_ = 0; + am_subchunk_ = NO; + + base_ = copy_int_list(rank_, base); + size_ = copy_int_list(rank_, size); + stride_ = (int *)malloc(sizeof(int) * rank_); + for (int i=0; i<rank_; i++) stride_[i] = 1; + data_status_ = NO_ALLOC; data_ptr_ = NULL; +} diff --git a/src/Panda/Chunk.h b/src/Panda/Chunk.h new file mode 100644 index 0000000..523a7d1 --- /dev/null +++ b/src/Panda/Chunk.h @@ -0,0 +1,68 @@ +#ifndef Chunk_dot_h +#define Chunk_dot_h + +#include "mpi.h" +#include "List.h" +#include "ArrayLayout.h" + +class Array; + + +class Chunk : public Template, public Linkable { + protected: + int *base_; + int *stride_; + int chunk_id_; /* This should be unique */ + int element_size_; + Array* array_; + Chunk* chunk_; + char *data_ptr_; + int stencil_width_; + DataStatus data_status_; + Boolean am_subchunk_; + + + void compute_first_last_chunk(int, int*, + ArrayLayout*,Distribution*, + Block_Distribution, int*, + int*); + void do_init(Array*,int,int, DataStatus); + void do_init(Chunk*,int,DataStatus); + void clear(); + + public: + Chunk(); + Chunk(Array*,int*,int*); + Chunk(Array*,int,int,DataStatus); + Chunk(Chunk*, int, DataStatus); + void init(Array*,int,int,DataStatus); + void init(Chunk*,int,DataStatus); + virtual ~Chunk(); + void chunk_overlaps(Array *, int*, int*, int); + int total_size_in_bytes(); + int total_size_in_elements(); + int chunk_id(); + void *data_ptr(); + void set_data_ptr(char *); + void set_stencil_width(int); + int* 
base(); + int* stride(); + int* size(); + int element_size(); + void base_offset(int*, void**); + void compute_overlap(Chunk*,int*,int*,int*); + void convert_from_number_to_index(int,int*); + void calculate_base_size_stride(int, int*, int*, int*, + ArrayLayout*, Distribution*, + Block_Distribution, int); + Array* array(); + Boolean am_subchunk(); + void copy_base_size_stride(int*,int*, int*); + void make_datatype(int*,int*,int*,void**,MPI_Datatype*); + +}; + +#endif + + + diff --git a/src/Panda/Collective_IO.C b/src/Panda/Collective_IO.C new file mode 100644 index 0000000..118afe6 --- /dev/null +++ b/src/Panda/Collective_IO.C @@ -0,0 +1,25 @@ +#include "definitions.h" +#include "Collective_IO.h" + +Collective_IO::Collective_IO(){} + +Collective_IO::~Collective_IO() +{ +} + +Boolean Collective_IO::continue_io() +{ + printf("This function should not be executed\n"); + return YES; +} + +void Collective_IO::start_to_finish(Boolean part_time_io, Array *array) +{ + printf("This function should not be executed\n"); +} + +void Collective_IO::compute_node_io_loop(Array *array) +{ + printf("This function should not be executed\n"); +} + diff --git a/src/Panda/Collective_IO.h b/src/Panda/Collective_IO.h new file mode 100644 index 0000000..aa351a7 --- /dev/null +++ b/src/Panda/Collective_IO.h @@ -0,0 +1,18 @@ +#ifndef Collective_IO_dot_h +#define Collective_IO_dot_h + +#include "List.h" +class Array; + +class Collective_IO : public Linkable{ + public: + Collective_IO(); + virtual ~Collective_IO(); + virtual Boolean continue_io(); + virtual void start_to_finish(Boolean, Array*); + virtual void compute_node_io_loop(Array*); +}; + +#endif + + diff --git a/src/Panda/List.C b/src/Panda/List.C new file mode 100644 index 0000000..8861a6f --- /dev/null +++ b/src/Panda/List.C @@ -0,0 +1,175 @@ +#include "definitions.h" +#include "List.h" + +Cell::Cell() +{ + item_ = NULL; + next_ = NULL; + prev_ = NULL; +} + +Cell::Cell(Linkable *new_item) +{ + item_ = new_item; + next_ = NULL; + prev_ 
= NULL; +} + +Cell::Cell(Linkable *new_item, Cell *prev) +{ + item_ = new_item; + prev_ = prev; + next_ = NULL; +} + +Cell::Cell(Linkable *new_item, Cell *next, Cell *prev) +{ + item_ = new_item; + next_ = next; + prev_ = prev; +} + + +Cell::~Cell() +{ + next_ = NULL; + prev_ = NULL; + item_ = NULL; +} + + +Linkable* Cell::item(){return item_;} + +Cell* Cell::next(){return next_;} + +Cell* Cell::prev(){return prev_;} + +void Cell::set_next(Cell *next) {next_ = next;} + +void Cell::set_prev(Cell *prev) {next_ = prev;} + +/*----------------------------------------------------*/ + +List::List() +{ + head_ = NULL; + tail_ = NULL; + old_search_val_ = NULL; +} + +List::~List() +{ + Cell* ptr = head_; + Cell* tmp; + + while(ptr != NULL) + { + tmp = ptr->next(); + delete ptr; + ptr = tmp; + } + head_ = NULL; + tail_ = NULL; + old_search_val_ = NULL; +} + + +/* Simply add to the beginning of the list */ +void List::insert(Linkable* new_item) +{ + add_first(new_item); +} + + +/* Add to the end of the list */ +void List::add_last(Linkable *new_item) +{ + Cell* tmp; + + if ((tail_ == NULL) && (head_ == NULL)) + { + /* The list is empty */ + tmp = new Cell(new_item); + tail_ =tmp; + head_ = tmp; + return; + } + else if ((tail_ != NULL) && (head_ != NULL)) + { + tmp = new Cell(new_item , tail_); + tail_->set_next(tmp); + tail_ = tmp; + return; + } + else + { + printf("Error in List obj\n"); + } + +} + +/* Add to the beginning of the list */ +void List::add_first(Linkable *new_item) +{ + Cell *tmp; + if ((tail_ == NULL) && (head_ == NULL)) + { + /* The list is empty */ + tmp = new Cell(new_item); + head_ = tmp; + tail_ = tmp; + return; + } + else if ((tail_ != NULL) && (head_ != NULL)) + { + tmp = new Cell(new_item, head_); + head_ = tmp; + return; + } + else + { + printf("Error in List obj\n"); + } +} + + +/* Remove the item from the List */ +void List::remove(Linkable *new_item) +{ + Cell *ptr = head_, *prev, *next; + while (ptr) + { + if (ptr->item()==new_item) + { + prev = 
ptr->prev(); + next = ptr->next(); + if ((prev != NULL) && (next != NULL)) + { + prev->set_next(next); + next->set_prev(prev); + delete ptr; + } + else if (prev != NULL) + { + /* ptr must be the last item */ + prev->set_next(NULL); + tail_ = prev; + delete ptr; + } + else if (next != NULL) + { + /* ptr must be the first item */ + next->set_prev(NULL); + head_ = next; + delete ptr; + } + else + { + head_ = tail_ = NULL; + delete ptr; + } + return; + } + else ptr = ptr->next(); + } +} diff --git a/src/Panda/List.h b/src/Panda/List.h new file mode 100644 index 0000000..1d162d4 --- /dev/null +++ b/src/Panda/List.h @@ -0,0 +1,61 @@ +#ifndef Link_dot_h +#define Link_dot_h + + +/* This is the dummy base class for all items * + * to be placed in a linked list. It would have * + * been cleaner to use Templates but support for * + * templates varies with different compilers and * + * the problem of code blow up etc exists. * + */ +class Linkable +{ + public: + Linkable(){}; + virtual ~Linkable(){}; + +}; + + +/* The Cells contains a Linkable element and ptrs * + * to the next and previos cells * + */ +class Cell { + Linkable *item_; + Cell *next_; + Cell *prev_; + public: + Cell(); + Cell(Linkable*); + Cell(Linkable*,Cell*); + Cell(Linkable*,Cell*,Cell*); + ~Cell(); + Linkable* item(); + Cell* next(); + Cell* prev(); + void set_next(Cell*); + void set_prev(Cell*); +}; + + +/* The List class provides support for creating a * + * list and provides operations like inserting, * + * deleting elements to the beginning and the end * + * of the list * + */ +class List { + public: + Cell *head_; + Cell *tail_; + Cell *old_search_val_; /* result of the previous search */ + List(); + ~List(); + void insert(Linkable*); + void add_last(Linkable*); + void add_first(Linkable*); + void remove(Linkable*); +}; + +#endif + + diff --git a/src/Panda/MPIFS.C b/src/Panda/MPIFS.C new file mode 100644 index 0000000..e8b56ae --- /dev/null +++ b/src/Panda/MPIFS.C @@ -0,0 +1,971 @@ +#include 
"definitions.h" +#include "MPIFS.h" +#include "Collective_IO.h" +#include "Simple_IO.h" +#include "Array.h" +#include "Chunk.h" +#include "message.h" +#define Max_Open_Files 1000 + + +#include "external/IEEEIO/src/Arch.h" + +extern "C" { + IOFile IEEEopen(char *,char *); + int IOclose(IOFile); +} + +int BRANCHING_FACTOR=8; +int SUBCHUNK_SIZE = 1048576; +MPIFS* MPIFS_global_obj; + +/* Notes,Hacks,Assumptions: + * - io_app_info_ and app_info_ point to the same object on the I/O + * for the regular. This hack is to allow for code re-use in the + * part-time I/O case. + */ + +/* Constructor for the normal case - i.e no part-time I/O nodes */ +MPIFS::MPIFS(int node_type, int app_num, int relative_rank, int app_size, + int *world_ranks) +{ +#ifdef DEBUG + int abs_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &abs_rank); + printf("%d:node_type=%d, app_num=%d , relative_rank=%d, app_size=%d\n", + abs_rank, node_type, app_num, relative_rank, app_size); +#endif + do_init(node_type, app_num, relative_rank, app_size, world_ranks); +} + +/* Constructor for the normal case - i.e no part-time I/O nodes */ +MPIFS::MPIFS(int node_type, int app_num, int relative_rank, int app_size, + int *world_ranks, Boolean shared_flag) +{ + do_init(node_type, app_num, relative_rank, app_size, world_ranks); +} + + +/* Constructor for part-time I/O nodes */ +MPIFS::MPIFS(int node_type, int comp_rank, int comp_size, int *comp_world_ranks, + int io_rank, int io_size, int *io_world_ranks) +{ +#ifdef DEBUG + int abs_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &abs_rank); + printf("%d:node_type=%d, comp_rank=%d, comp_size=%d io_rank=%d io_size=%d\n", + abs_rank,node_type, comp_rank, comp_size, io_rank, io_size); +#endif + do_init(node_type, comp_rank, comp_size, comp_world_ranks, io_rank, io_size, + io_world_ranks); +} + +/* Initialize the file system object for the regular case (i.e no part-time I/O nodes) */ +void MPIFS::do_init(int node_type, int app_num, int relative_rank, int app_size, + int *world_ranks) +{ + 
MPI_Status status; + int tag, tmp; + + MPIFS_global_obj = this; + + if ((node_type != IO_NODE) && (node_type != COMPUTE_NODE)) + { + printf("Invalid node type in MPIFS::do_init(int,int,int,int,int*)\n"); + exit(10); + } + + /* Initialize the state */ + node_type_ = node_type; + + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank_); + + app_num_ = app_num; + app_rank_ = relative_rank; + app_size_ = app_size; + app_info_ = new App_Info(app_num, app_size, world_ranks); + comm_ = (MPI_Comm *) malloc(sizeof(MPI_Comm)); + MPI_Comm_split(MPI_COMM_WORLD, app_num_, app_rank_, comm_); + num_apps_ = num_apps_alive_ = global_barrier_count_ = 0; + current_max_app_num_ = -1; + compute_apps_info_ = NULL; + mem_buf_ = NULL; + num_open_files_ = 0; + for (int i=0; i<Max_Open_Files; i++) open_file_names_[i] = NULL; + + /* Part-time i/o stuff - unneeded in this case */ + io_app_num_ = io_app_rank_ = io_app_size_ = -1; + io_comm_ = NULL; + io_app_info_ = NULL; + + /* Broadcast the rank of the master I/O node. The strategy is * + * to send the info to node 0 and have it broadcast it */ + master_io_node_ = -1; + if (node_type_ == IO_NODE) + master_io_node_ = app_info_->get_master(); + + if (world_rank_ == 0) + { + if (master_io_node_ != 0) + receive_message((void *)&master_io_node_, 1, MPI_INT, MPI_ANY_SOURCE, + 1000, MPI_COMM_WORLD, &status); + } + else if (world_rank_ == master_io_node_) + { + send_message((void *)&master_io_node_, 1 , MPI_INT, 0, 1000, + MPI_COMM_WORLD); + } + MPI_Bcast((void *)&master_io_node_, 1, MPI_INT, 0, MPI_COMM_WORLD); + /* Now all nodes know who the master I/O node is */ + + if (node_type_ == IO_NODE) + { + mem_buf_size_ = 2*SUBCHUNK_SIZE; /* Factor of 2 - just to be safe */ + mem_buf_ = (char *) malloc(sizeof(char)*mem_buf_size_); + + io_app_info_ = app_info_; + io_node_main_loop(); + } + else if (node_type_ == COMPUTE_NODE) + { + /* Send the info about the compute application to the * + * master I/O node (only master I/O node has to do it) */ + if 
(am_master_compute_node()) + { + tag = app_num_*100+APP_INFO*10+SPECIAL; + send_message((void *)world_ranks, app_size, MPI_INT, + master_io_node_, tag, MPI_COMM_WORLD); + receive_message((void *)&tmp, 1, MPI_INT, master_io_node_, + tag, MPI_COMM_WORLD, &status); + } + MPI_Barrier(*comm_); + + /* Create an intra-comm with the I/O nodes. This stuff is * + * used only for implemneting barriers etc */ + MPI_Comm *inter_comm = (MPI_Comm *) malloc(sizeof(MPI_Comm)); + MPI_Comm *intra_com = (MPI_Comm *) malloc(sizeof(MPI_Comm)); + MPI_Intercomm_create(*comm_, 0, MPI_COMM_WORLD, + master_io_node_, app_num, inter_comm); + MPI_Intercomm_merge(*inter_comm, 1, intra_com); + app_info_->set_intra_comm(intra_com); + } + else + { + printf("Unsupported node type\n"); + exit(1); + } +} + +void MPIFS::do_init(int node_type, int comp_rank, int comp_size, int *comp_world_ranks, + int io_rank, int io_size, int *io_world_ranks) +{ + MPI_Group global_group, comp_group, io_group; + + + MPIFS_global_obj = this; + if ((node_type != PART_TIME_COMPUTE) && (node_type != PART_TIME_IO)) + { + printf("Incorrect initialization for node_type %d\n", node_type); + exit(10); + } + + /* Convention that logical I/O app gets app_num=0 and compute app get app_num=1 */ + node_type_ = node_type; + + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank_); + + app_num_ = 1; + app_rank_ = comp_rank; + app_size_ = comp_size; + master_io_node_ = io_world_ranks[0]; + comm_ = (MPI_Comm *)malloc(sizeof(MPI_Comm)); + MPI_Comm_group(MPI_COMM_WORLD, &global_group); + MPI_Group_incl(global_group, comp_size, comp_world_ranks, &comp_group); + MPI_Comm_create(MPI_COMM_WORLD, comp_group, comm_); + app_info_ = new App_Info(1, app_size_, comp_world_ranks); + + num_apps_ = 1; + num_apps_alive_ =1; + current_max_app_num_=-1; + global_barrier_count_ =0; + compute_apps_info_ = NULL; + mem_buf_ = NULL; + + io_app_num_ = 0; /* By convention */ + io_app_rank_ = io_rank; + io_app_size_ = io_size; + io_app_info_ = NULL; + + /* Everyone in 
MPI_COMM_WORLD must make this call */ + io_comm_ = (MPI_Comm*) malloc(sizeof(MPI_Comm)); + MPI_Comm_group(MPI_COMM_WORLD, &global_group); + MPI_Group_incl(global_group, io_size, io_world_ranks, &io_group); + MPI_Comm_create(MPI_COMM_WORLD, io_group, io_comm_); + + if (node_type_ == PART_TIME_IO) + { + mem_buf_size_ = 2*SUBCHUNK_SIZE; /* Factor of 2 - just to be safe */ + mem_buf_ = (char *) malloc(sizeof(char)*mem_buf_size_); + App_Info *app = new App_Info(1, comp_size, comp_world_ranks); + io_app_info_ = new App_Info(0, io_app_size_, io_world_ranks); + insert_compute_app(1, app); + } + +} + +MPIFS::~MPIFS() +{ + if (node_type_ == COMPUTE_NODE) + { + if (am_master_compute_node()) + send_message((void *)&app_num_, 1, MPI_INT, master_io_node_, + QUIT, MPI_COMM_WORLD); + } + + + if (app_info_) delete app_info_; + if (mem_buf_) free(mem_buf_); + + for (int i=0; i<num_open_files_; i++) { + free(open_file_names_[i]); + IOclose((IOFile)open_file_ptrs_[i]); + } + + if (compute_apps_info_){ + for(int i=0; i<=current_max_app_num_; i++) + if (compute_apps_info_[i]) delete compute_apps_info_[i]; + delete compute_apps_info_; + } + + app_info_ = io_app_info_ = NULL; + compute_apps_info_ =NULL; + if (comm_) + { + MPI_Comm_free(comm_); + free(comm_); + comm_= NULL; + } +} + + +Boolean MPIFS::am_compute_node() +{ + if (node_type_ == IO_NODE) return NO; + else return YES; +} + + +Boolean MPIFS::am_io_node() +{ + if ((node_type_ == IO_NODE) || (node_type_ == PART_TIME_IO)) + return YES; + else return NO; +} + +Boolean MPIFS::am_master_io_node() +{ + if (am_io_node() && (world_rank_ == io_app_info_->get_master())) + return YES; + else return NO; +} + +Boolean MPIFS::am_master_compute_node() +{ + if (am_compute_node() && (world_rank_ == app_info_->get_master())) + return YES; + else return NO; +} + + + + + +/* This is a highly restricted version of a broadcast function. The broadcast + * is performed using tree-structured communication, starting at relative + * node 0. 
The broadcast is implemented using tree-structured communication. + */ +void MPIFS::Broadcast(int node_type, void *buf, int count, + MPI_Datatype datatype, int tag) +{ + App_Info *app; + int my_rank = app_rank_, size; + int low, high, i, dest; + + if (node_type == COMPUTE_NODE) app = app_info_; + else app = io_app_info_; + size = app->app_size(); + + low = my_rank*BRANCHING_FACTOR+1; + high = (my_rank+1)*BRANCHING_FACTOR+1; + i = low; + +#ifdef DEBUG + printf("%d: Bcast low %d high %d size=%d\n", world_rank_,low, high,size); +#endif + /* Can use asynchronous sends */ + while ((i<size) && (i<high)) { + dest = app->world_rank(i); + send_message(buf, count, datatype, dest, tag, MPI_COMM_WORLD); + i++; + } +} + + +void MPIFS::io_node_main_loop() +{ + MPI_Status msg_status; + int msg_tag, msg_code, msg_src; + + while(1){ + wait_for_next_message(&msg_code, &msg_src, &msg_tag, &msg_status); + switch(msg_code){ + + case SPECIAL: + process_io_special_message(msg_src, msg_tag, &msg_status); + break; + + case ARRAYGROUP_SCHEMA: + start_collective_io(msg_src, msg_tag, &msg_status); + break; + + case ATTRIBUTE_SCHEMA: + start_attribute_io(msg_src, msg_tag, &msg_status); + break; + + case QUIT: + if (received_quit_message(msg_src, msg_tag, &msg_status)) + return; + break; + + default: + printf("Error - did not understand message code %d from %d with tag %d\n", + msg_code, msg_src, msg_tag); + break; + } + } +} + +void MPIFS::wait_for_next_message(int *msg_code, int *msg_src, int *msg_tag, + MPI_Status *msg_status) +{ + MPI_Probe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, msg_status); + *msg_tag = msg_status->MPI_TAG; + *msg_src = msg_status->MPI_SOURCE; + *msg_code = *msg_tag % 10; + return; +} + +void MPIFS::process_io_special_message(int msg_src, int msg_tag, + MPI_Status *status) +{ + int msg_code = (msg_tag / 10) % 10 ; /* Extract the second digit */ + int msg_len, app_num, *world_ranks; + MPI_Comm *inter_comm, *intra_com; + App_Info *app; + + switch (msg_code) + { + case 
APP_INFO: + MPI_Get_count(status, MPI_INT, &msg_len); + world_ranks = (int *) malloc(sizeof(int)*msg_len); + receive_message((void *)world_ranks, msg_len, MPI_INT, msg_src, + msg_tag, MPI_COMM_WORLD, status); + Broadcast(IO_NODE, world_ranks, msg_len, MPI_INT, msg_tag); + app_num = msg_tag / 100; + if (am_master_io_node()) + send_message((void *) &app_num, 1, MPI_INT, msg_src, + msg_tag, MPI_COMM_WORLD); + app = new App_Info(app_num, msg_len, world_ranks); + insert_compute_app(app_num, app); + /* Create the Intra communicator */ + inter_comm = (MPI_Comm *) malloc(sizeof(MPI_Comm)); + intra_com = (MPI_Comm *) malloc(sizeof(MPI_Comm)); + MPI_Intercomm_create(*comm_, 0, MPI_COMM_WORLD, + world_ranks[0], (msg_tag/100), inter_comm); + MPI_Intercomm_merge(*inter_comm, 0, intra_com); + app->set_intra_comm(intra_com); + free(world_ranks); + break; + + case APP_BARRIER: + /* This should be used very carefully when there are more than + * 1 compute application running + */ + receive_message((void *)&app_num,1, MPI_INT, msg_src, msg_tag, + MPI_COMM_WORLD, status); + Broadcast(IO_NODE, (void *) &app_num, 1, MPI_INT, msg_tag); + MPI_Barrier(*(find_compute_app(app_num)->intra_comm())); + break; + + case GLOBAL_BARRIER: + receive_message((void *)&app_num,1, MPI_INT, msg_src, msg_tag, + MPI_COMM_WORLD, status); + Broadcast(IO_NODE, (void *) &app_num, 1, MPI_INT, msg_tag); + global_barrier_count_++; + if (global_barrier_count_ == num_apps_alive_) + { + MPI_Barrier(MPI_COMM_WORLD); + global_barrier_count_ = 0; + } + break; + + case CLEANFILES: + receive_message((void *)&app_num,1, MPI_INT, msg_src, msg_tag, + MPI_COMM_WORLD,status); + Broadcast(IO_NODE, (void *) &app_num, 1, MPI_INT, msg_tag); + cleanfiles(app_num); + break; + + case FLUSHFILES: + receive_message((void *)&app_num,1, MPI_INT, msg_src, msg_tag, + MPI_COMM_WORLD, status); + Broadcast(IO_NODE, (void *) &app_num, 1, MPI_INT, msg_tag); + if (world_rank_ == 0) flushfiles(app_num); + + break; + + + case CREATEFILES: + 
receive_message((void *)&app_num,1, MPI_INT, msg_src, msg_tag, + MPI_COMM_WORLD, status); + Broadcast(IO_NODE, (void *) &app_num, 1, MPI_INT, msg_tag); + createfiles(app_num); + break; + + default: + printf("Unknown message code %d in proces_io_special\n", msg_code); + break; + } +} + + +void MPIFS::cleanfiles(int app_num) +{ + char buf[64]; + sprintf(buf, "rm -rf %s%d\n", FILEPREFIX, app_num); + if (world_rank_ == 0) system(buf); +} + +void MPIFS::createfiles(int app_num) +{ + char buf[64]; + sprintf(buf, "mkdir %s%d\n", FILEPREFIX, app_num); + if (world_rank_ == 0) system(buf); +} + +void MPIFS::flushfiles(int app_num) +{ + FILE *flushfp; + char filename[64]; + + sprintf(filename, "%s%d/%s.%d", FILEPREFIX, app_num,"flushfile",world_rank_); + if ((flushfp = fopen(filename,"wb+"))==NULL) + { + printf("Cannot open flush file on io node\n"); + exit(1); + }; + +#ifdef TARGETAIX + int size = 4*1024*1024; // on SP2 +#else + int size = 1*1024*1024; // on bunny +#endif + char * buffer = (char*) malloc(sizeof(char) * size); +#ifdef TARGETAIX + for(int i=0; i < 32; i++){ + fwrite(buffer, sizeof(char), size, flushfp); + } +#else + fwrite (buffer, sizeof(char), size, flushfp); +#endif + int filedesc; + filedesc = fileno(flushfp); + fsync(filedesc); + fclose(flushfp); + + if ((flushfp = fopen(filename,"r"))==NULL) + { + printf("Cannot open flush file on io node\n"); + exit(1); + }; + +#ifdef TARGETAIX + for(i=0; i < 32; i++){ + fread(buffer, sizeof(char), size, flushfp); + } +#else + fread(buffer, sizeof(char), size, flushfp); +#endif + + fclose(flushfp); + free(buffer); +} + + +void MPIFS::insert_compute_app(int app_num, App_Info *app) +{ + + if (app_num > current_max_app_num_) + { + if (compute_apps_info_) + compute_apps_info_ = (App_Info **)realloc(compute_apps_info_, + sizeof(App_Info*)*(app_num+1)); + else + compute_apps_info_ = (App_Info**)malloc(sizeof(App_Info*)* + (app_num + 1)); + + for(int i=current_max_app_num_+1; i <= app_num ;i++){ + compute_apps_info_[i] = NULL; 
+ } + current_max_app_num_ = app_num; + } + compute_apps_info_[app_num] = app; + num_apps_++; + num_apps_alive_++; +} + +App_Info* MPIFS::find_compute_app(int app_num) +{ + return compute_apps_info_[app_num]; +} + + +Boolean MPIFS::received_quit_message(int msg_src, int msg_tag, + MPI_Status *status) +{ + int app_num; + receive_message((void *) &app_num, 1, MPI_INT, msg_src, msg_tag, + MPI_COMM_WORLD, status); + Broadcast(IO_NODE, (void *) &app_num, 1, MPI_INT, msg_tag); + num_apps_alive_--; + if (num_apps_alive_ == 0) return YES; + else return NO; +} + +IOFile MPIFS::open_file(char *name, int op_type) +{ + IOFile fp; + + for (int i=0; i<num_open_files_; i++) + if (!strcmp(name, open_file_names_[i])) { + fp = open_file_ptrs_[i]; + free(name); + break; + } + + if (i == num_open_files_) { + char name1[1000], fpfx[100]; + FILE *fp1; + + fp1 = fopen("FILEPREFIX", "r"); + fscanf(fp1, "%s", fpfx); + fclose(fp1); + sprintf(name1, "%s/%s", fpfx, name); + + open_file_names_[num_open_files_] = name; + if ((op_type == RESTART) || (op_type == READ_TIMESTEP)) + fp = open_file_ptrs_[num_open_files_++] = IEEEopen(name1, "r"); + else if ((op_type == TIMESTEP) || (op_type == CHECKPOINT)) + fp = open_file_ptrs_[num_open_files_++] = IEEEopen(name1, "w"); + is_new_file_[i] = YES; + + } + + return fp; +} + +Boolean MPIFS::is_new_file(char *name) { + char name1[100]; + if (node_type() == PART_TIME_COMPUTE || node_type() == COMPUTE_NODE) return; + sprintf(name1, "%s.%d", name, my_rank(IO_NODE)); + + for (int i=0; i<num_open_files_; i++) + if (!strcmp(name1, open_file_names_[i])) { + if (is_new_file_[i] == YES) { + is_new_file_[i] = NO; + return YES; + } else return NO; + } + printf("Can't find the file\n"); + exit(0); + return NO; +} + + +void MPIFS::start_attribute_io(int msg_src, int msg_tag, MPI_Status *status) +{ + char *schema_buf; + int msg_len, i; + IOFile fp = 0; + + mpi_get_count(status, MPI_CHAR, &msg_len); + schema_buf = (char *) malloc(sizeof(char)*msg_len); + 
receive_message((void *)schema_buf, msg_len, MPI_CHAR, msg_src, + msg_tag, MPI_COMM_WORLD,status); + Broadcast(IO_NODE, (void *)schema_buf, msg_len, MPI_CHAR, msg_tag); + + char *ptr = schema_buf; + union int_to_char tmp; + for (i=0; i<4; i++) tmp.c[i] = *ptr++; + int op_type = tmp.i; + for (i=0; i<4; i++) tmp.c[i] = *ptr++; + int len = tmp.i; + char *fname = (char *)malloc(sizeof(char) * (len + 1)); + char *name1 = (char *)malloc(sizeof(char) * (len + 6)); + for (i=0; i<len; i++) fname[i] = *ptr++; + fname[i] = '\0'; + sprintf(name1, "%s.%d", fname, world_rank_); + fp = open_file(name1, op_type); + Attribute *attr = new Attribute(ptr, op_type); + if (op_type == TIMESTEP) attr->write_data(fp); + else if (op_type == READ_TIMESTEP) { + attr->read_data(fp); + if (am_master_io_node()) send_attr_data(attr); + } + delete attr; + free(schema_buf); +} + +void MPIFS::start_collective_io(int msg_src, int msg_tag, MPI_Status *status) +{ + int *schema_buf, msg_len, comp_app_num = (msg_tag / 10); + Collective_IO *new_io; + IOFile fp = 0; + + mpi_get_count(status, MPI_INT, &msg_len); + schema_buf = (int *) malloc(sizeof(int)*msg_len); + receive_message((void *)schema_buf, msg_len, MPI_INT, msg_src, + msg_tag, MPI_COMM_WORLD,status); + Broadcast(IO_NODE, (void *)schema_buf, msg_len, MPI_INT, msg_tag); + + int len = schema_buf[2]; + char *name = (char *)malloc(sizeof(char) * (len + 1)); + char *name1 = (char *)malloc(sizeof(char) * (len + 6)); + for (int i=0; i<len; i++) name[i] = schema_buf[3+i]; + name[i] = '\0'; + sprintf(name1, "%s.%d", name, world_rank_); + fp = open_file(name1, schema_buf[1]); + free(name); + + switch(schema_buf[0]){ + case SIMPLE_IO: + new_io = new Simple_IO(schema_buf, msg_len,world_rank_, comp_app_num, + compute_apps_info_[comp_app_num]->app_size(), + compute_apps_info_[comp_app_num], fp); + break; + + default: + printf("Error in start_collective_io - undefined strategy\n"); + exit(1); + break; + } + +#ifdef DEBUG + printf("Starting the collective IO for 
compute app %d\n",comp_app_num); +#endif + new_io->start_to_finish(NO, NULL); +#ifdef DEBUG + printf("Finished the collective I/O for compute app %d\n", comp_app_num); +#endif + release_compute_nodes(comp_app_num); +} + + +void MPIFS::part_time_io_node_loop(int *schema_buf, int msg_len, + Array *array) +{ + Collective_IO *new_io; + IOFile fp = 0; + + int len = schema_buf[2]; + char *name = (char *)malloc(sizeof(char) * (len + 1)); + char *name1 = (char *)malloc(sizeof(char) * (len + 6)); + for (int i=0; i<len; i++) name[i] = schema_buf[3+i]; + name[i] = '\0'; + sprintf(name1, "%s.%d", name, world_rank_); + fp = open_file(name1, schema_buf[1]); + free(name); + + switch(schema_buf[0]){ + case SIMPLE_IO: + new_io = new Simple_IO(schema_buf, msg_len, world_rank_, + 1, app_size_, app_info_, fp); + break; + + default: + printf("Error in part_time_io_node_loop - undefined strategy\n"); + exit(1); + break; + } + new_io->start_to_finish(YES, array); + release_compute_nodes(1); +} + +void MPIFS::compute_node_io_loop(Array *array) +{ + Simple_IO *simple; + + switch(array->io_strategy()){ + case SIMPLE_IO: + simple = new Simple_IO(); + simple->compute_node_io_loop(array); + break; + + default: + printf("Error in MPIFS::compute_node_io_loop - Undefined i/o strategy\n"); + exit(1); + break; + } + compute_side_io_done(); +} + + +int MPIFS::app_size(int node_type) +{ + if (node_type == COMPUTE_NODE) + { + if (node_type_ != IO_NODE) + return app_size_; + else { + printf("Error in MPIFS::app_size - wrong node_type\n"); + exit(10); + } + } + else if (node_type == IO_NODE) + { + if (node_type_ == IO_NODE) return app_size_; + else if (node_type_ == PART_TIME_IO) return io_app_size_; + else { + printf("Error in MPIFS::app_size - wrong node_type\n"); + exit(10); + } + } + else + { + printf("Error in MPIFS::app_size - wrong node_type\n"); + exit(10); + + } + return -1; +} + +int MPIFS::my_rank(int node_type) +{ + if (node_type == COMPUTE_NODE) + { + if (node_type_ != IO_NODE) + return 
app_rank_; + else { + printf("Error in MPIFS::my_rank - wrong node_type\n"); + exit(10); + } + } + else if (node_type == IO_NODE) + { + if (node_type_ == IO_NODE) return app_rank_; + else if (node_type_ == PART_TIME_IO) return io_app_rank_; + else { + printf("Error in MPIFS::my_rank - wrong node_type\n"); + exit(10); + } + } + else + { + printf("Error in MPIFS::my_rank - wrong node_type\n"); + exit(10); + + } + return -1; +} + +int MPIFS::node_type(){return node_type_;} + +void MPIFS::send_array_schema(Array *array) +{ + int *schema, schema_size; + if (am_master_compute_node()){ + array->pack(&schema, &schema_size); + send_message((void *)schema, schema_size, MPI_INT, master_io_node_, + app_num_*10+ARRAYGROUP_SCHEMA, MPI_COMM_WORLD); + } +} + +void MPIFS::receive_attr_data(Attribute *attr) +{ + int msg_len, i; + MPI_Status status; + char *ptr; + + MPI_Probe(MPI_ANY_SOURCE, ATTRIBUTE_DATA, MPI_COMM_WORLD, &status); + mpi_get_count(&status, MPI_CHAR, &msg_len); + void *data_buf = (void *) malloc(msg_len); + receive_message((void *)data_buf, msg_len, MPI_CHAR, status.MPI_SOURCE, + ATTRIBUTE_DATA, MPI_COMM_WORLD, &status); + Broadcast(COMPUTE_NODE, data_buf, msg_len, MPI_CHAR, ATTRIBUTE_DATA); + + ptr = (char *)data_buf; + union int_to_char tmp; + for (i=0; i<4; i++) tmp.c[i] = *ptr++; + attr->set_esize(tmp.i); + for (i=0; i<4; i++) tmp.c[i] = *ptr++; + attr->set_count(tmp.i); + attr->set_data_ptr(ptr); +} + +void MPIFS::send_attr_data(Attribute *attr) +{ + int i; + void *data_buf = (void *)malloc((attr->data_size() + 8)); + char *ptr = (char *)data_buf; + union int_to_char tmp; + tmp.i = attr->esize(); + for (i=0; i<4; i++) *ptr++ = tmp.c[i]; + tmp.i = attr->count(); + for (i=0; i<4; i++) *ptr++ = tmp.c[i]; + memcpy(ptr, attr->get_data_ptr(), attr->data_size()); + + int master_comp_node = compute_apps_info_[1]->get_master(); + send_message(data_buf, attr->data_size()+8, MPI_CHAR, + master_comp_node, ATTRIBUTE_DATA, MPI_COMM_WORLD); + free(data_buf); +} + +void 
MPIFS::receive_attr_schema() +{ + char *schema_buf; + int msg_len; + MPI_Status status; + + MPI_Probe(MPI_ANY_SOURCE, ATTRIBUTE_SCHEMA, MPI_COMM_WORLD, &status); + mpi_get_count(&status, MPI_CHAR, &msg_len); + schema_buf = (char *) malloc(sizeof(char) * msg_len); + receive_message((void *)schema_buf, msg_len, MPI_CHAR, status.MPI_SOURCE, + ATTRIBUTE_SCHEMA, MPI_COMM_WORLD, &status); + Broadcast(IO_NODE, (void *)schema_buf, msg_len, MPI_CHAR, ATTRIBUTE_SCHEMA); + free(schema_buf); +} + +void MPIFS::send_attr_schema(Attribute *attr, char *fname, int op_type) +{ + char *schema; + int schema_size; + + if (am_master_compute_node()){ + attr->pack(schema_size, schema, fname, op_type); + send_message((void *)schema, schema_size, MPI_CHAR, master_io_node_, + ATTRIBUTE_SCHEMA, MPI_COMM_WORLD); + } + free(schema); +} + +/* Called from the compute node side */ +void MPIFS::user_commands(int cmd) +{ + if (node_type_ == COMPUTE_NODE) + { + int tag = cmd*10+SPECIAL; + if (am_master_compute_node()){ + send_message((void *)&app_num_, 1, MPI_INT, master_io_node_, + tag, MPI_COMM_WORLD); + } + + if (cmd == APP_BARRIER){ + MPI_Barrier(*(app_info_->intra_comm())); + } else if (cmd == GLOBAL_BARRIER){ + MPI_Barrier(MPI_COMM_WORLD); + } + } + else { + /* Must be the part_time I/O case */ + switch(cmd){ + case APP_BARRIER: + MPI_Barrier(*comm_); + break; + case GLOBAL_BARRIER: + MPI_Barrier(MPI_COMM_WORLD); + break; + case CLEANFILES: + if (node_type_ == PART_TIME_IO) + cleanfiles(app_num_); + break; + case FLUSHFILES: + if (node_type_ == PART_TIME_IO) + flushfiles(app_num_); + break; + case CREATEFILES: + if (node_type_ == PART_TIME_IO) + createfiles(app_num_); + break; + default: + printf("Unknown message code %d\n", cmd); + break; + } + } +} + +/* This function is called by io nodes or part-time io nodes */ +void MPIFS::release_compute_nodes(int app_num) +{ + App_Info *app; + + if (node_type_ == PART_TIME_IO){ + MPI_Barrier(MPI_COMM_WORLD); + } else if ((node_type_ == IO_NODE) && 
(num_apps_ == 1)){ + MPI_Barrier(*comm_); + if (am_master_io_node()){ + app = find_compute_app(app_num); + send_message(&app_num, 1, MPI_INT, app->get_master(), COMP_QUIT, + MPI_COMM_WORLD); + } + } else { + printf("Error in release_compute_nodes - wrong node type \n"); + exit(11); + } +} + +/* This function is called by the compute nodes after they have done their part */ +void MPIFS::compute_side_io_done() +{ + int app_num; + MPI_Status status; + + if (node_type_ == PART_TIME_COMPUTE){ + MPI_Barrier(MPI_COMM_WORLD); + } else if (node_type_ == COMPUTE_NODE){ + if (am_master_compute_node()) + receive_message((void *)&app_num, 1, MPI_INT, master_io_node_, COMP_QUIT, + MPI_COMM_WORLD, &status); + MPI_Barrier(*comm_); + } else { + printf("Error in compute_side_io_doen - wrong node type\n"); + exit(11); + } +} + +App_Info* MPIFS::io_app_info(){ + return io_app_info_; +} + +int MPIFS::master_io_node(){ + return master_io_node_; +} + +int MPIFS::mem_buf_size() +{ + return mem_buf_size_; +} + +char *MPIFS::mem_buf() +{ + return mem_buf_; +} + +void MPIFS::set_mem_buf_size(int size) +{ + mem_buf_size_ = size; +} + +void MPIFS::set_mem_buf(char *buf) +{ + mem_buf_ = buf; +} diff --git a/src/Panda/MPIFS.h b/src/Panda/MPIFS.h new file mode 100644 index 0000000..4fbedfe --- /dev/null +++ b/src/Panda/MPIFS.h @@ -0,0 +1,95 @@ +#ifndef MPIFS_dot_h +#define MPIFS_dot_h + +#include "definitions.h" +#include "VirtFS.h" +#include "mpi.h" +#include "App_Info.h" +#include "List.h" +#include "Attribute.h" + + +class Collective_IO; +class Array; + + +class MPIFS : public VirtFS { + int node_type_; /* compute,io,part_time .. 
*/ + int world_rank_; /* rank in MPI_COMM_WORLD */ + int app_num_; /* for io-nodes this should be 0 */ + int app_rank_; /* rank within the applications */ + int app_size_; /* size of the application */ + int master_io_node_; + MPI_Comm *comm_; + App_Info *app_info_; /* rank --> world mapping */ + + /* Information used by the IO nodes */ + int num_apps_; /* # of compute apps */ + int num_apps_alive_; + int current_max_app_num_; + int global_barrier_count_; + App_Info **compute_apps_info_; + char *mem_buf_; + int mem_buf_size_; + int num_open_files_; + IOFile open_file_ptrs_[1000]; + char *open_file_names_[1000]; + Boolean is_new_file_[1000]; + + /* Information required for part-time nodes */ + int io_app_num_; + int io_app_rank_; + int io_app_size_; + MPI_Comm *io_comm_; + App_Info *io_app_info_; + + void do_init(int,int,int,int,int*); + void do_init(int,int,int,int*, int,int,int*); + void wait_for_next_message(int*,int*,int*, MPI_Status*); + void process_io_special_message(int,int, MPI_Status*); + void cleanfiles(int); + void createfiles(int); + void flushfiles(int); + void insert_compute_app(int , App_Info*); + App_Info* find_compute_app(int); + Boolean received_quit_message(int,int,MPI_Status*); + + + public: + MPIFS(int,int,int, int,int*); + MPIFS(int,int,int,int,int*, Boolean); + MPIFS(int,int,int,int*,int,int,int*); + virtual ~MPIFS(); + Boolean am_master_compute_node(); + Boolean am_compute_node(); + Boolean am_master_io_node(); + Boolean am_io_node(); + void Broadcast(int,void*, int,MPI_Datatype,int); + void io_node_main_loop(); + void start_collective_io(int,int, MPI_Status*); + void start_attribute_io(int,int,MPI_Status*); + void part_time_io_node_loop(int*,int, Array*); + void compute_node_io_loop(Array*); + + int app_size(int); + int my_rank(int); + void send_array_schema(Array*); + void send_attr_schema(Attribute*, char*, int); + void receive_attr_schema(); + void send_attr_data(Attribute *); + void receive_attr_data(Attribute *); + int node_type(); + 
/* Currently the sequential case is unsupported */

/* Default constructor: plain UNIX file system, no parallel I/O layer.
 * No MPIFS object is created; file_system_ stays NULL. */
Panda::Panda()
{
  file_system_type_ = UNIX_SYSTEM;
  file_system_ = NULL;
}

/* This is the interface for regular Panda (i.e no part-time io nodes).
 * node_type selects compute vs. I/O role; world_ranks maps the
 * application-relative ranks onto MPI_COMM_WORLD ranks. */
Panda::Panda(int node_type, int app_num , int relative_rank,int app_size,
             int* world_ranks)
{
  global_system_type_ = file_system_type_ = MPI_SYSTEM;
  file_system_ = new MPIFS(node_type, app_num, relative_rank, app_size,
                           world_ranks);
}

/* This is the interface for regular Panda (i.e no part-time io nodes).
 * Same as above, with an extra shared_flag forwarded to MPIFS
 * (shared-file mode; semantics defined by the MPIFS constructor). */
Panda::Panda(int node_type, int app_num , int relative_rank,int app_size,
             int* world_ranks, Boolean shared_flag)
{
  global_system_type_ = file_system_type_ = MPI_SYSTEM;
  file_system_ = new MPIFS(node_type, app_num, relative_rank, app_size,
                           world_ranks, shared_flag);
}

/* This is the interface for part-time io nodes: each node belongs both to
 * the compute application (comp_* arguments) and, for the first io_size
 * ranks, to the I/O sub-application (io_* arguments). */
Panda::Panda(int node_type, int comp_rank, int comp_size, int *comp_world_ranks,
             int io_rank, int io_size, int *io_world_ranks)
{
  global_system_type_ = file_system_type_ = MPI_SYSTEM;
  file_system_ = new MPIFS(node_type, comp_rank, comp_size, comp_world_ranks,
                           io_rank, io_size, io_world_ranks);
}
The constructor assumes that MPIRUN library has been + * installed and you have distinct applications at the mpirun level + */ + +/* +Panda::Panda(int node_type) +{ + int app_size, app_rank, *world_ranks, leader; + + file_system_type_ = MPI_SYSTEM; + if ((node_type == COMPUTE_NODE) || (node_type == IO_NODE)){ + MPI_Comm_size(MPIRUN_APP_COMM, &app_size); + MPI_Comm_rank(MPIRUN_APP_COMM, &app_rank); + leader = MPIRUN_APP_LEADERS[MPIRUN_APP_ID]; + world_ranks = (int *) malloc(sizeof(int)*app_size); + for(int i=0; i<app_size;i++) + world_ranks[i] = leader + i; + file_system_ = new MPIFS(node_type, MPIRUN_APP_ID, app_rank, app_size, + world_ranks); + free(world_ranks); + world_ranks = NULL; + } else { + printf("Error: Invalid constructor for this node_type %d\n", node_type); + exit(1); + } +} +*/ + +/* This is the simplest interface for the part-time i/o nodes. Here the number + * of i/o nodes is specified. The first <num_io_nodes> are designated as part + * time i/o nodes and the remaining as part-time compute. This requires mpirun + * library to be initialized and there should be only one mpirun application. 
+ */ +/* +Panda::Panda(int node_type, int num_io_nodes) +{ + int app_rank, app_size, *io_ranks, *world_ranks; + + file_system_type_ = MPI_SYSTEM; + if ((node_type == COMPUTE_NODE) || (node_type == IO_NODE)){ + printf("Error: Invalid constructor\n"); + exit(1); + } else if (MPIRUN_NUM_APPS == 1){ + MPI_Comm_size(MPIRUN_APP_COMM, &app_size); + MPI_Comm_rank(MPIRUN_APP_COMM, &app_rank); + world_ranks = (int *)malloc(sizeof(int)*app_size); + io_ranks = (int *) malloc(sizeof(int)*num_io_nodes); + for(int i=0;i<app_size; i++) world_ranks[i] = i; + for(i=0;i<num_io_nodes;i++) io_ranks[i] =i; + if (app_rank < num_io_nodes){ + file_system_ = new MPIFS(PART_TIME_IO, app_rank, app_size, world_ranks, + app_rank, num_io_nodes, io_ranks); + } else { + file_system_ = new MPIFS(PART_TIME_COMPUTE, app_rank, app_size, world_ranks, + -1, num_io_nodes, io_ranks); + } + } else { + printf("Error: Part-time I/O nodes - More than one mpirun app running\n"); + exit(1); + } +} +*/ + +Panda::~Panda() +{ + if (file_system_) delete file_system_; + file_system_ = NULL; + +} + +void Panda::global_barrier() +{ + if (file_system_type_ == MPI_SYSTEM) + { + ((MPIFS *) file_system_)->user_commands(GLOBAL_BARRIER); + } +} + +void Panda::app_barrier() +{ + if (file_system_type_ == MPI_SYSTEM) + { + ((MPIFS *) file_system_)->user_commands(APP_BARRIER); + } +} + +void Panda::flushfiles() +{ + if (file_system_type_ == MPI_SYSTEM) + { + ((MPIFS *) file_system_)->user_commands(FLUSHFILES); + } +} + +void Panda::cleanfiles() +{ + if (file_system_type_ == MPI_SYSTEM) + { + ((MPIFS *) file_system_)->user_commands(CLEANFILES); + } +} + +void Panda::createfiles() +{ + if (file_system_type_ == MPI_SYSTEM) + { + ((MPIFS *) file_system_)->user_commands(CREATEFILES); + } +} diff --git a/src/Panda/Panda.h b/src/Panda/Panda.h new file mode 100644 index 0000000..c2515b3 --- /dev/null +++ b/src/Panda/Panda.h @@ -0,0 +1,31 @@ +#ifndef Panda_dot_h +#define Panda_dot_h + +#include "VirtFS.h" +#include "MPIFS.h" + + +class 
Panda { + int file_system_type_; + VirtFS *file_system_; + +public: + Panda(); + Panda(int, int, int,int, int*); + Panda(int, int, int,int, int*, Boolean); + Panda(int,int,int,int*,int,int,int*); + Panda(int); + Panda(int, int); + ~Panda(); + + /* stuff required only for testing purposes */ + void global_barrier(); + void app_barrier(); + void cleanfiles(); + void flushfiles(); + void createfiles(); +}; + +#endif + + diff --git a/src/Panda/Shared_IO.C b/src/Panda/Shared_IO.C new file mode 100644 index 0000000..8b5a5cd --- /dev/null +++ b/src/Panda/Shared_IO.C @@ -0,0 +1,237 @@ +#include "definitions.h" +#include "ArrayGroup.h" +#include "MPIFS.h" +#include "Chunk.h" +#include "App_Info.h" +#include "Array.h" +#include "message.h" +#include "Shared_IO.h" + + +extern MPIFS* MPIFS_global_obj; +extern int SUBCHUNK_SIZE; + +Shared_IO::Shared_IO(int *schema_string, int schema_size, int world_rank, + int comp_app_num,int comp_app_size , App_Info *app_info) +: Simple_IO(schema_string, schema_size, world_rank, comp_app_num, + comp_app_size, app_info) +{ + + compute_chunk_ = new Chunk(); + current_chunk_ = new Chunk(); + subchunk_ = new Chunk(); + current_array_id_ = -1; + if ((op_type_ == RESTART)||(op_type_ == GENERAL_READ)|| + (op_type_ == READ_TIMESTEP)) + read_op_ = YES; + else + read_op_ = NO; + + /* We need to set the following variables so that continue_io()* + * would start the I/O of the first subchunk automatically */ + contiguous_ = NO; + current_array_id_ = -1; + current_chunk_id_ = 0; + num_of_chunks_ = -1; /* This will cause get_next_chunk() to fail */ + current_subchunk_id_ = 0; + num_of_subchunks_ = -1; /* Causes get_next_subchunk() to fail */ + status_flag_ = START; + continue_io(); +} + +Shared_IO::~Shared_IO() +{ + if (subchunk_) delete subchunk_; + if (compute_chunk_) delete compute_chunk_; + subchunk_ = compute_chunk_ = NULL; +} + +Boolean Shared_IO::get_next_array(){ + current_array_id_++; + if (current_array_id_ < num_of_arrays_){ + make_subchunks_ = 
-1; + current_array_ = find_array(current_array_id_); + nat_chunked_ = current_array_->nat_chunked(); + sub_chunked_ = current_array_->sub_chunked(); + array_rank_ = current_array_->rank(); + + if (array_rank_ > max_rank_){ + realloc_schema_bufs(array_rank_); + } + num_of_chunks_ = current_array_->layout(IO_NODE)->total_elements(); + current_chunk_id_ = -1; + if (nat_chunked_ && !sub_chunked_) + contiguous_ = YES; /* No need to use derived datatypes */ + else + contiguous_ = NO; /* Have to use derived datatypes */ + + bytes_to_go_ = 0; + current_subchunk_id_ = -1; + return YES; + } else + return NO; +} + + +Boolean Shared_IO::get_next_chunk() +{ + int *ptr; + + if (!current_array_) return NO; + current_chunk_id_ = current_array_->get_next_index(current_chunk_id_, + my_io_rank_, + num_io_nodes_); + if (current_chunk_id_ < num_of_chunks_){ + current_chunk_->set_data_ptr(NULL); + current_chunk_->init(current_array_, current_chunk_id_, + IO_NODE, NO_ALLOC); + if (contiguous_){ + bytes_to_go_ = current_chunk_->total_size_in_bytes(); + current_chunk_->set_data_ptr(mem_buf_); + ptr = schema_bufs_[0]; + *ptr++ = current_array_id_; + *ptr++ = current_chunk_id_; + *ptr++ = (int) nat_chunked_; + *ptr++ = (int) contiguous_; + *ptr++ = op_type_; + *ptr++ = 0; + *ptr++ = 0; + compute_chunk_overlaps(current_array_, current_chunk_); + } + else { + if (!sub_chunked_ && (make_subchunks_ == -1)){ + current_array_->make_sub_chunks(current_chunk_); + make_subchunks_ = 1; + } + num_of_subchunks_ = current_array_->layout(SUB_CHUNK)->total_elements(); + current_subchunk_id_ = -1; + } + return YES; + } + else + return NO; +} + + +/* This should not be called for the contiguous_ case */ +Boolean Shared_IO::get_next_subchunk() +{ + current_subchunk_id_++; + if (current_subchunk_id_ < num_of_subchunks_){ + subchunk_->set_data_ptr(NULL); + subchunk_->init(current_chunk_, current_subchunk_id_, NO_ALLOC); + bytes_to_go_ = subchunk_->total_size_in_bytes(); + + if (bytes_to_go_ < mem_buf_size_) + 
realloc_mem_bufs(bytes_to_go_); + + subchunk_->set_data_ptr(mem_buf_); + return YES; + } + else + return NO; +} + + +void Shared_IO::start_subchunk_io() +{ + int *ptr; + + if (contiguous_){ + ptr = schema_bufs_[0]; + ptr[6] = min(SUBCHUNK_SIZE, bytes_to_go_); + + nb_send_message((void *)ptr, 7, MPI_INT, dest_ids_[0], + CHUNK_SCHEMA, MPI_COMM_WORLD, &schema_requests_[0]); + if (read_op_){ + read_data(mem_buf_, ptr[6]); + nb_send_message((void *)mem_buf_, ptr[6], MPI_CHAR, dest_ids_[0], + CHUNK_DATA_FROM_IO, MPI_COMM_WORLD, &requests_[0]); + } + else + nb_receive_message((void *)mem_buf_, ptr[6], MPI_CHAR, dest_ids_[0], + CHUNK_DATA_TO_IO, MPI_COMM_WORLD, &requests_[0]); + ptr[5] += ptr[6]; /* Offset of the next subchunk */ + bytes_to_go_ -= ptr[6]; + status_flag_ = WAITING; + + } else { + compute_chunk_overlaps(current_array_, subchunk_); + + compute_schemas(current_array_, subchunk_, compute_chunk_, + current_array_id_); + if (read_op_){ + read_data(subchunk_); + send_data_to_compute_nodes(subchunk_, NULL, NULL); + } + else + receive_data_from_compute_nodes(subchunk_, NULL, NULL); + status_flag_ = WAITING; + } +} + + +Boolean Shared_IO::test_subchunk_io() +{ + int flag; + MPI_Testall(num_overlaps_, requests_, &flag, statuses_); + if (flag) { + /* Free schema request objects - Do we need to do this */ + MPI_Waitall(num_overlaps_, schema_requests_,statuses_); + status_flag_ = START; + if (!read_op_) + if (contiguous_) + write_data(mem_buf_, schema_bufs_[0][6], 1); + else + write_data(subchunk_); + + if (!contiguous_) free_datatypes(); + return YES; + } + return NO; +} + + +/* Return YES, if I/O is complete */ +Boolean Shared_IO::continue_io() +{ + if (status_flag_ == START){ + if (!start_next_subchunk_io()) return YES; /* IO completed */ + } else if (status_flag_ == WAITING){ + if (test_subchunk_io()) + if (!start_next_subchunk_io()) return YES; /* IO done */ + } else { + printf("Error - Invalid status_flag value \n"); + exit(11); + } + return NO; +} + +/* Return yes 
/* Return yes if you can start the io of another subchunk.
 *
 * Drives the incremental (restartable) I/O state machine: find the next
 * unit of work — subchunk, then chunk, then array — and kick off its
 * transfer via start_subchunk_io().  Returns NO only when every array
 * has been exhausted, i.e. the whole shared I/O operation is complete.
 */
Boolean Shared_IO::start_next_subchunk_io()
{
  if (contiguous_){
    /* Contiguous (naturally chunked, no subchunking): the chunk is moved
     * in SUBCHUNK_SIZE slices tracked by bytes_to_go_; only advance to
     * the next chunk/array once the current chunk is fully transferred. */
    if (bytes_to_go_ <= 0){
      while(!get_next_chunk()){
        if (!get_next_array()) return NO;
      }
      /* Since we might be looking at another array */
      /* (the new array may not be contiguous, in which case its first
       *  subchunk must be set up before I/O can start) */
      if (!contiguous_) get_next_subchunk();
    }

    start_subchunk_io();
  } else {

    if (!get_next_subchunk()){
      /* We have finished this chunk */
      while(!get_next_chunk()){
        if (!get_next_array()) return NO;
      }
      /* the next array may have switched us into contiguous mode, in
       * which case no subchunk setup is needed */
      if (!contiguous_) get_next_subchunk();
    }

    start_subchunk_io();
  }
  return YES;
}
+ int IOreadInfo(IOFile,int *,int *,int *,int); + int IOreadAttributeInfo(IOFile, char *,int *, int *); + int IOreadAttribute(IOFile,int,void*); +} + +/* This constructor is needed by the compute node to create a dummy object. + * The dummy object is needed so that the compute node can execute the + * specialized compute node io loop + */ +Simple_IO::Simple_IO() +{ + dummy_ = YES; + schema_string_ = current_schema_ptr_ = NULL; + current_array_ =NULL; + current_chunk_ = NULL; + num_io_nodes_ = -1; + my_io_rank_ = -1; + compute_app_num_ = -1; + app_info_ = NULL; + part_time_io_ = NO; + compute_node_array_ =NULL; + overlap_chunk_ids_ = dest_ids_ = NULL; + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank_); + schema_requests_ = NULL; + requests_ =NULL; + statuses_ =NULL; + datatypes_ = NULL; + schema_bufs_ = NULL; + data_ptrs_ = NULL; + overlap_base_ = overlap_size_ =overlap_stride_ =NULL; + mem_buf_ = NULL; +} + +Simple_IO::Simple_IO(int *schema_string, int schema_size, int world_rank, + int comp_app_num, int comp_app_size , App_Info *app_info, + IOFile fp) +{ + int schema_buf_size; + + dummy_ = NO; + schema_string_ = schema_string; + schema_size_ = schema_size; + current_schema_ptr_ = schema_string; + num_io_nodes_ = MPIFS_global_obj->app_size(IO_NODE); + my_io_rank_ = MPIFS_global_obj->my_rank(IO_NODE); + compute_app_num_ = comp_app_num; + app_info_ = app_info; + world_rank_ = world_rank; + + num_overlaps_ = 0; + max_overlaps_ = comp_app_size; + overlap_chunk_ids_ = (int *) malloc(sizeof(int)*max_overlaps_); + dest_ids_ = (int *) malloc(sizeof(int)*max_overlaps_); + schema_bufs_ = (int **) malloc(sizeof(int *) *max_overlaps_); + requests_ = (MPI_Request*)malloc(sizeof(MPI_Request)*max_overlaps_); + schema_requests_ = (MPI_Request*)malloc(sizeof(MPI_Request)*max_overlaps_); + statuses_ = (MPI_Status*) malloc(sizeof(MPI_Status)*max_overlaps_); + datatypes_ = (MPI_Datatype*)malloc(sizeof(MPI_Datatype)*max_overlaps_); + max_rank_ = 10; + overlap_base_ = (int *) 
malloc(sizeof(int)*max_rank_); + overlap_stride_ = (int *) malloc(sizeof(int)*max_rank_); + overlap_size_ = (int *) malloc(sizeof(int)*max_rank_); + data_ptrs_ = (char **) malloc(sizeof(char*)*max_overlaps_); + part_time_io_ = NO; + compute_node_array_ = NULL; + mem_buf_size_ = MPIFS_global_obj->mem_buf_size(); + mem_buf_ = MPIFS_global_obj->mem_buf(); + + schema_buf_size = 6+ max_rank_*3; + for(int i=0; i < max_overlaps_; i++){ + data_ptrs_[i] = NULL; + schema_bufs_[i] = (int *) malloc(sizeof(int)*schema_buf_size); + } + + current_array_ = new Array(&schema_string); + + current_chunk_ = NULL; + num_of_chunks_ = 0; + num_of_subchunks_ = 0; + current_chunk_id_ = -1; + current_subchunk_id_ = -1; + file_ptr_ = NULL; + schema_file_ptr_ = NULL; + file_ptr_ = fp; +} + +Simple_IO::~Simple_IO() +{ + if (dummy_){ + } else { + + /* This is the object created for the I/O nodes */ + + if (current_array_) delete current_array_; + if (schema_string_) free(schema_string_); + if (overlap_chunk_ids_) free(overlap_chunk_ids_); + if (dest_ids_) free(dest_ids_); + if (requests_) free(requests_); + if (schema_requests_) free(schema_requests_); + if (statuses_) free(statuses_); + if (datatypes_) free(datatypes_); + if (overlap_base_) free(overlap_base_); + if (overlap_size_) free(overlap_size_); + if (overlap_stride_) free(overlap_stride_); + + if (schema_bufs_){ + for(int i=0;i < max_overlaps_; i++){ + if (schema_bufs_[i]) free(schema_bufs_[i]); + schema_bufs_[i] = NULL; + } + free(schema_bufs_); + } + + if (data_ptrs_) free(data_ptrs_); + + schema_bufs_ = NULL; + data_ptrs_ = NULL; + overlap_base_ = overlap_size_ = overlap_stride_ = NULL; + overlap_chunk_ids_ = dest_ids_ = NULL; + requests_ = NULL; + schema_requests_ = NULL; + statuses_ = NULL; + datatypes_ =NULL; + schema_string_ = NULL; + } +} + +void Simple_IO::realloc_buffers(int new_size) +{ + int schema_buf_size = 6+max_rank_*3; + + + overlap_chunk_ids_=(int *) realloc(overlap_chunk_ids_, new_size*sizeof(int)); + schema_bufs_ = 
(int **) realloc(schema_bufs_, new_size*sizeof(int*)); + dest_ids_ = (int *) realloc(overlap_chunk_ids_, new_size*sizeof(int)); + requests_ = (MPI_Request*)realloc(requests_, new_size*sizeof(MPI_Request)); + schema_requests_ = (MPI_Request*)realloc(schema_requests_, + new_size*sizeof(MPI_Request)); + statuses_ = (MPI_Status*)realloc(statuses_, new_size*sizeof(MPI_Status)); + datatypes_ = (MPI_Datatype*)realloc(datatypes_, + new_size*sizeof(MPI_Datatype)); + data_ptrs_ = (char **) realloc(data_ptrs_, new_size*sizeof(char*)); + for(int i=max_overlaps_;i<new_size;i++){ + schema_bufs_[i] = (int *)malloc(sizeof(int)*schema_buf_size); + data_ptrs_[i] = NULL; + } + max_overlaps_ = new_size; +} + +/* This is called only for the following cases * + * - natural chunking with user-specified subchunking * + * - reorganization (with or without user-specified chunking) */ +void Simple_IO::compute_chunk_overlaps(Array *array, Chunk *subchunk) +{ + int num_compute_chunks; + + if (nat_chunked_){ + num_overlaps_ = 1; + overlap_chunk_ids_[0] = current_chunk_id_; + } + else{ + num_compute_chunks = array->layout(COMPUTE_NODE)->total_elements(); + if (num_compute_chunks > max_overlaps_) realloc_buffers(num_compute_chunks); + subchunk->chunk_overlaps(array, &num_overlaps_, + overlap_chunk_ids_, COMPUTE_NODE); + } + + for(int i=0; i < num_overlaps_;i++) { + dest_ids_[i]=app_info_->world_rank(array->which_node(overlap_chunk_ids_[i], + COMPUTE_NODE)); +} + +#ifdef DEBUG + printf("For subchunk_id %d of chunk %d\n", current_subchunk_id_, + current_chunk_id_); + printf("The overlapping compute chunk ids are \n"); + for(int k =0; k < num_overlaps_; k++) printf("%d ", overlap_chunk_ids_[k]); + printf("\n"); +#endif +} + + +/* This is called only for the following cases * + * - natural chunking with user-specified subchunking * + * - reorganization (with or without user-specified chunking) */ +void Simple_IO::compute_schemas(Array *array, Chunk *subchunk , + Chunk *compute_chunk) +{ + if 
(nat_chunked_ && !contiguous_ && !overlaped_){ + subchunk->copy_base_size_stride(overlap_base_, overlap_size_, + overlap_stride_); + send_schema_message(0); + make_datatype(subchunk, 0); + } + else if (!nat_chunked_) { + for (int i=0; i< num_overlaps_; i++){ + compute_chunk->init(array, overlap_chunk_ids_[i], COMPUTE_NODE, NO_ALLOC); + subchunk->compute_overlap(compute_chunk, overlap_base_, overlap_size_, + overlap_stride_); + send_schema_message(i); + make_datatype(subchunk, i); + } + } else { + printf("Error - In Simple_IO::compute_schemas\n"); + exit(1); + } +} + + +/* The chunk_id is in overlap_chunk_ids_[index], the dest is in * + * in dest_ids_[index]. The rank,base,stride and size info is in * + * overlap_base, overlap_size, overlap_stride, array_rank_ */ +void Simple_IO::send_schema_message(int index) +{ + int *ptr = schema_bufs_[index]; + int schema_size = 5+array_rank_*3; + + *ptr++ = overlap_chunk_ids_[index]; + *ptr++ = (int) nat_chunked_; + *ptr++ = (int) contiguous_; + *ptr++ = array_rank_; + *ptr++ = op_type_; + + for(int i=0; i < array_rank_; i++) *ptr++ = overlap_base_[i]; + for(i=0; i < array_rank_; i++) *ptr++ = overlap_size_[i]; + for(i=0; i < array_rank_; i++) *ptr++ = overlap_stride_[i]; + + if (part_time_io_ && (dest_ids_[index] == world_rank_)) + /* No need to send the message */ + schema_requests_[index] = MPI_REQUEST_NULL; + else + nb_send_message((void *)schema_bufs_[index], schema_size, MPI_INT, + dest_ids_[index], index*10+CHUNK_SCHEMA, MPI_COMM_WORLD, + &schema_requests_[index]); +} + +/* The overlap base, size, stride are in overlap_base, overlap_size, * + * and overlap_stride */ +void Simple_IO::make_datatype(Chunk *subchunk, int index) +{ + void *ptr; + subchunk->make_datatype(overlap_base_, overlap_size_, overlap_stride_, + &ptr, &datatypes_[index]); + data_ptrs_[index] = (char *) ptr; +} + +/* Again this function is called only for the following cases * + * - natural chunking with user-specified subchunking * + * - re-organization 
with/without user-specified chunking * + * The case of natural chunking (with no user-specified * + * subchunking) is handled seperately */ + +void Simple_IO::receive_data(Chunk *subchunk, int index, int &array_bytes_to_go) +{ + + if (part_time_io_ && (dest_ids_[index] == world_rank_)){ + /* Perform a mem copy of the required chunk */ + copy_data(subchunk, index, NO, array_bytes_to_go); + requests_[index] = MPI_REQUEST_NULL; + } else + nb_receive_message((void *)data_ptrs_[index], 1, datatypes_[index], + dest_ids_[index], index*10+CHUNK_DATA_TO_IO, + MPI_COMM_WORLD, &requests_[index]); +} + +/* Again this function is called only for the following cases * + * - natural chunking with user-specified subchunking * + * - re-organization with/without user-specified chunking * + * The case of natural chunking (with no user-specified * + * subchunking) is handled seperately */ +void Simple_IO::send_data(Chunk *subchunk, int index, int &array_bytes_to_go) +{ + if (part_time_io_ && (dest_ids_[index] == world_rank_)){ + /* Perform a memory copy of the required chunk */ + copy_data(subchunk, index, YES, array_bytes_to_go); + requests_[index] =MPI_REQUEST_NULL; + } else { + /* Send the required datatype using a non-blocking send */ + nb_send_message((void *)data_ptrs_[index], 1, datatypes_[index], + dest_ids_[index], index*10+CHUNK_DATA_FROM_IO, + MPI_COMM_WORLD, &requests_[index]); + } +} + +void Simple_IO::read_data(Chunk *subchunk) +{ + int size; + size = subchunk->total_size_in_bytes(); + read_data((char *)(subchunk->data_ptr()), size, subchunk->element_size()); +} + +void Simple_IO::read_data(char *buf, int size, int esize) +{ + int n,bytes_to_go=size,buf_size; + char *tmp_buf = buf; + + while(bytes_to_go > 0){ + buf_size = min(bytes_to_go, SUBCHUNK_SIZE); + n = IOreadStream(file_ptr_, (void *)tmp_buf, buf_size/esize); + if (n != buf_size){ + printf("Error reading data - write only %d instead of %d bytes\n", + n, buf_size); +// exit(1); + } + bytes_to_go -= buf_size; + 
tmp_buf += buf_size; + } +} + + +void Simple_IO::write_data(char *buf, int size, int esize) +{ + int n, bytes_to_go = size, buf_size; + char *tmp_buf = buf; + + while(bytes_to_go > 0){ + buf_size = min(bytes_to_go, SUBCHUNK_SIZE); + n = IOwriteStream(file_ptr_, (void *)tmp_buf, buf_size/esize); + if (n != buf_size){ + printf("Error writing data - write only %d instead of %d bytes\n", + n, buf_size); + exit(1); + } + tmp_buf += buf_size; + bytes_to_go -= buf_size; + } +} + +void Simple_IO::write_data(Chunk* subchunk) +{ + int size; + size = subchunk->total_size_in_bytes(); + write_data((char *)(subchunk->data_ptr()), size, subchunk->element_size()); +} + +void Simple_IO::free_datatypes() +{ + for(int i=0; i <num_overlaps_; i++) MPI_Type_free(&datatypes_[i]); +} + +void Simple_IO::send_data_to_compute_nodes(Chunk *subchunk, + int &array_bytes_to_go) +{ + for(int i=0; i< num_overlaps_; i++) + send_data(subchunk, i, array_bytes_to_go); +} + +void Simple_IO::receive_data_from_compute_nodes(Chunk *subchunk, + int &array_bytes_to_go) +{ + for (int i=0; i< num_overlaps_; i++) + receive_data(subchunk, i, array_bytes_to_go); +} + +void Simple_IO::wait_for_completion(int &array_bytes_to_go, + Array *compute_array) +{ + int flag=0; + + if (part_time_io_){ + /* This is to avoid deadlocks */ + while (!flag){ + MPI_Testall(num_overlaps_, requests_, &flag, statuses_); + if (array_bytes_to_go > 0) + process_compute_message(array_bytes_to_go, compute_array); + } + } else { + MPI_Waitall(num_overlaps_, requests_, statuses_); + } + /* Free the schema request objects - Do we need this*/ + MPI_Waitall(num_overlaps_, schema_requests_, statuses_); +} + +/* For part-io nodes, get the data using memory copy if the * + * data resides on the same node. 
/* For part-io nodes, get the data using memory copy if the  *
 * data resides on the same node.                            *
 *
 * Moves one overlap's worth of data between the I/O-side buffer
 * (data_ptrs_[index] viewed through datatypes_[index]) and the local
 * compute chunk, without any MPI message traffic: the data is packed
 * from the source datatype into a contiguous scratch buffer and then
 * unpacked into the destination datatype.
 *
 * flag == YES : I/O side -> compute chunk (read direction)
 * flag == NO  : compute chunk -> I/O side (write direction)
 *
 * schema_bufs_[index] layout (written by send_schema_message):
 *   [0] compute chunk id, [3] rank, [5..] base, size, stride vectors.
 * array_bytes_to_go is decremented by the bytes moved (when positive) so
 * the part-time compute loop knows when the whole array is done.        */
void Simple_IO::copy_data(Chunk *subchunk, int index, Boolean flag,
                          int &array_bytes_to_go)
{
  void *comp_data_ptr;
  MPI_Datatype comp_datatype;
  int position=0, buf_size;
  void *buf=NULL;
  /* decode the schema message that would otherwise have been sent */
  int *schema = schema_bufs_[index];
  int comp_chunk_id = schema[0];
  int comp_array_rank = schema[3];
  int *base = &schema[5];
  int *size = &schema[5+comp_array_rank*1];
  int *stride = &schema[5+comp_array_rank*2];
  int bytes_copied = num_elements(comp_array_rank, size)*
    subchunk->element_size();
  Array *comp_array = compute_node_array_;
  Chunk *comp_chunk = comp_array->find_chunk(comp_chunk_id);
  /* build the compute-side datatype describing the overlap region */
  comp_chunk->make_datatype(base, size,stride, &comp_data_ptr,
                            &comp_datatype);
  if (array_bytes_to_go > 0) array_bytes_to_go -= bytes_copied;

  if (flag){
    /* read direction: pack from the I/O datatype, unpack into compute */
    MPI_Pack_size(1, datatypes_[index], MPI_COMM_WORLD, &buf_size);
    buf = (void *) malloc(buf_size);
    MPI_Pack(data_ptrs_[index], 1, datatypes_[index], buf, buf_size,
             &position, MPI_COMM_WORLD);
    position =0;
    MPI_Unpack(buf, buf_size, &position, comp_data_ptr, 1, comp_datatype,
               MPI_COMM_WORLD);
    free(buf);
  } else {
    /* write direction: pack from compute, unpack into the I/O datatype */
    MPI_Pack_size(1, comp_datatype, MPI_COMM_WORLD, &buf_size);
    buf = (void *) malloc(buf_size);
    MPI_Pack(comp_data_ptr, 1, comp_datatype, buf, buf_size,
             &position, MPI_COMM_WORLD);
    position = 0;
    MPI_Unpack(buf, buf_size, &position, data_ptrs_[index], 1,
               datatypes_[index], MPI_COMM_WORLD);
    free(buf);
  }
  MPI_Type_free(&comp_datatype);  /* the I/O-side datatype is freed later
                                     by free_datatypes() */
}
/* Run one complete array I/O operation on an I/O node, start to finish.
 *
 * part_time      - YES when this node is a part-time I/O node and must
 *                  interleave compute-side message processing to avoid
 *                  deadlock.
 * compute_array  - the compute-side Array (part-time case only).
 *
 * For reads, the array schema is first recovered from the IEEEIO file and
 * broadcast to the compute application.  Then each chunk assigned to this
 * I/O node is streamed either contiguously (natural chunking, no
 * subchunking: raw byte slices, no derived datatypes) or subchunk by
 * subchunk using MPI derived datatypes built from the overlap schemas.
 */
void Simple_IO::start_to_finish(Boolean part_time, Array *compute_array)
{
  int make_subchunks, bytes_to_go;
  int array_bytes_to_go,*ptr;
  Boolean read_op;
  Chunk *chunk=NULL, *subchunk=NULL, *compute_chunk=NULL, *tmp_chunk;

  /* reads are RESTART / GENERAL_READ / READ_TIMESTEP; everything else
   * is treated as a write */
  op_type_ = current_array_->op_type();
  if ((op_type_ == RESTART)||(op_type_ == GENERAL_READ)||
      (op_type_ == READ_TIMESTEP))
    read_op = YES;
  else
    read_op = NO;

  part_time_io_ = part_time;
  compute_node_array_ = compute_array;

  if (read_op) {
    /* Recover rank/type/global size from the file and distribute the
     * schema to the compute application. */
    int numbertype, rank, index, datatype, length;
    int *dims = (int *)malloc(sizeof(int) * 10);
    IOreadInfo(file_ptr_, &numbertype, &rank, dims, 10);
    int *size = (int *)malloc(sizeof(int) * 10);

    index = IOreadAttributeInfo(file_ptr_, "global_size", &datatype, &length);
    if (index >=0 ) { // the attribute exists
      IOreadAttribute(file_ptr_, index, size);
      current_array_->init(rank, numbertype, size, IO_NODE);
    } else { printf("Error: no attribute, global_size\n"); exit(0); }
    free(dims);
    /* NOTE(review): `size` is never freed on this path (leak), and the
     * diagnostic below assumes rank >= 3 — confirm before relying on it */

printf("%d: read rank %d, numbertype %d, size (%d %d %d)\n", world_rank_,
       rank, numbertype, size[0], size[1], size[2]);

    int schema_size = 2 + rank;
    int *schema = (int *)malloc(sizeof(int) * schema_size);
    if (MPIFS_global_obj->am_master_io_node()) {
      /* master I/O node forwards the schema to the compute master */
      schema[0] = rank; schema[1] = numbertype;
      for (int i=0; i<rank; i++) schema[2+i] = size[i];
      send_message((void *)schema, schema_size, MPI_INT,
                   app_info_->get_master(),
                   ARRAYGROUP_SCHEMA, MPI_COMM_WORLD);
    }
    if (part_time_io_) {
      /* part-time nodes also act as compute nodes: receive the schema
       * and rebroadcast it to the whole compute side */
      MPI_Status status;
      receive_message(schema, schema_size, MPI_INT, MPI_ANY_SOURCE,
                      ARRAYGROUP_SCHEMA, MPI_COMM_WORLD, &status);
      MPIFS_global_obj->Broadcast(COMPUTE_NODE, (void *)schema,
                                  schema_size, MPI_INT, ARRAYGROUP_SCHEMA);

      compute_array->init(rank, numbertype, size, COMPUTE_NODE);
    }
    free(schema);
  }

  if (part_time_io_) array_bytes_to_go = compute_node_array_->array_info();

  /* To reduce costs associated with object creation and deletion, we  *
   * will create a dummy chunk,subchunk and compute chunk object and   *
   * re-initialize them whenever necessary.                            */
  tmp_chunk = chunk = new Chunk();
  current_chunk_ = chunk;
  subchunk = new Chunk();
  compute_chunk = new Chunk();

  make_subchunks = -1;

  nat_chunked_ = current_array_->nat_chunked();
  sub_chunked_ = current_array_->sub_chunked();
  overlaped_ = current_array_->overlaped();
  if (overlaped_) { contiguous_ = NO; nat_chunked_ = NO; }
  else {
    if (nat_chunked_ && !sub_chunked_)
      contiguous_ = YES;   /* No need to use derived datatypes */
    else contiguous_ = NO; /* Have to use derived datatypes */
  }

  array_rank_ = current_array_->rank();
  if (array_rank_ > max_rank_) realloc_schema_bufs(array_rank_);

  if (read_op) current_array_->read_schema_file(file_ptr_);

  num_of_chunks_ = current_array_->layout(IO_NODE)->total_elements();
  current_chunk_id_ = current_array_->get_next_index(chunk, -1, my_io_rank_,
                                                     num_io_nodes_,
                                                     num_of_chunks_);

#ifdef DEBUG
  printf("%d: current_chunk_id_=%d my_io_rank=%d num_io_nodes=%d\n",
         world_rank_, current_chunk_id_, my_io_rank_, num_io_nodes_);
#endif
  if (contiguous_){
    /* Natural chunked and no user-specified subchunking. Therefore we don't
     * need to used mpi-derived datatypes. */

    while (current_chunk_id_ < num_of_chunks_) {
      if (!read_op) {
        /* reserve file space; IEEEIO wants the dimensions reversed */
        int *tmp_size = (int *)malloc(sizeof(int) * array_rank_);
        for (int cnt = 0; cnt < array_rank_; cnt++)
          tmp_size[cnt] = chunk->size()[array_rank_ - cnt - 1];
        IOreserveChunk(file_ptr_, current_array_->ieee_size(),
                       array_rank_, tmp_size);
        //printf("##### called IOreserveChunk for n.c. %d %d %d %d %d\n", current_array_->ieee_size(), array_rank_, tmp_size[0], tmp_size[1], tmp_size[2]);

        free(tmp_size);
        if (num_of_chunks_ > 1) {
          /* NOTE(review): attribute length is hard-coded to 3 — presumably
           * rank-3 grids only; confirm for other ranks */
          IOwriteAttribute(file_ptr_,"chunk_origin", INT32, 3, chunk->base());
          IOwriteAttribute(file_ptr_, "chunk_size", INT32, 3, chunk->size());
        }
      }

      /* for part-time io case, if chunk resides on same node, perform the *
       * read/write operation directly.                                    */
      num_overlaps_ = 1;
      overlap_chunk_ids_[0] = current_chunk_id_;
      dest_ids_[0] = app_info_->world_rank(current_array_->which_node(
                                        current_chunk_id_, COMPUTE_NODE));

      if (part_time_io_ && (world_rank_ == dest_ids_[0])){
        direct_io(current_chunk_id_, read_op, array_bytes_to_go);
      } else {
        bytes_to_go = chunk->total_size_in_bytes();
        chunk->set_data_ptr(mem_buf_);

        /* Make the schema request */
        ptr = schema_bufs_[0];
        *ptr++ = current_chunk_id_;
        *ptr++ = (int)nat_chunked_;
        *ptr++ = (int)contiguous_;
        *ptr++ = op_type_;
        *ptr++ = 0; /* This is the offset */
        *ptr++ = 0; /* Size of the data */

        /* stream the chunk in SUBCHUNK_SIZE slices; ptr[4] tracks the
         * running offset, ptr[5] the size of the current slice */
        ptr = schema_bufs_[0];
        while(bytes_to_go > 0){
          ptr[5] = min(SUBCHUNK_SIZE, bytes_to_go);

          nb_send_message((void *)ptr, 6, MPI_INT, dest_ids_[0],
                          CHUNK_SCHEMA, MPI_COMM_WORLD, &schema_requests_[0]);
          if (read_op){
            read_data(mem_buf_, ptr[5], chunk->element_size());
            nb_send_message((void *)mem_buf_, ptr[5], MPI_CHAR, dest_ids_[0],
                            CHUNK_DATA_FROM_IO, MPI_COMM_WORLD, &requests_[0]);
          } else
            nb_receive_message((void *)mem_buf_, ptr[5], MPI_CHAR,
                               dest_ids_[0], CHUNK_DATA_TO_IO,
                               MPI_COMM_WORLD, &requests_[0]);
          /* Have to watch for deadlock over here */
          wait_for_completion(array_bytes_to_go, compute_node_array_);
          if (!read_op) write_data(mem_buf_, ptr[5], chunk->element_size());
          ptr[4] += ptr[5];
          bytes_to_go -= ptr[5];
        }
        chunk->set_data_ptr(NULL);
      }
      current_chunk_id_ = current_array_->get_next_index(chunk,
                                                         current_chunk_id_,
                                                         my_io_rank_,
                                                         num_io_nodes_,
                                                         num_of_chunks_);
    } /* End while */
  } /* End if (contiguous_) */
  else {
    /* We have no choice but to use MPI-derived datatypes */
    while(current_chunk_id_ < num_of_chunks_){
      if (!read_op) {
        int *tmp_size = (int *)malloc(sizeof(int) * array_rank_);
        for (int cnt = 0; cnt < array_rank_; cnt++)
          tmp_size[cnt] = chunk->size()[array_rank_ - cnt - 1];
        IOreserveChunk(file_ptr_, current_array_->ieee_size(),
                       array_rank_, tmp_size);
        //printf("##### called IOreserveChunk for r.o. %d %d %d %d %d\n", current_array_->ieee_size(), array_rank_, tmp_size[0], tmp_size[1], tmp_size[2]);

        free(tmp_size);
        if (num_of_chunks_ > 1) {
          IOwriteAttribute(file_ptr_,"chunk_origin", INT32, 3, chunk->base());
          IOwriteAttribute(file_ptr_, "chunk_size", INT32, 3, chunk->size());
        }
      }

      /* If the array is not subchunked, then subchunk the array into      *
       * SUBCHUNK_SIZE chunks. This is to reduce the size of the           *
       * messages and the memory requirements. The current version makes a *
       * dumb assumption, that if the user specifies the subchunks,        *
       * then the size of those subchunks is less than SUBCHUNK_SIZE.      *
       * It's a dumb assumption and needs to be fixed.                     */

      if (!sub_chunked_ && (make_subchunks == -1)){
        current_array_->make_sub_chunks(chunk);
        make_subchunks = 1;
      }
      num_of_subchunks_ =current_array_->layout(SUB_CHUNK)->total_elements();

      for (current_subchunk_id_=0; current_subchunk_id_ < num_of_subchunks_;
           current_subchunk_id_++){
        subchunk->init(chunk, current_subchunk_id_, NO_ALLOC);
        bytes_to_go = subchunk->total_size_in_bytes();

        /* grow the staging buffer if this subchunk does not fit */
        if (bytes_to_go > mem_buf_size_) realloc_mem_bufs(bytes_to_go);
        subchunk->set_data_ptr(mem_buf_);

        compute_chunk_overlaps(current_array_, subchunk);
        compute_schemas(current_array_, subchunk, compute_chunk);

        if (read_op){
          read_data(subchunk);
          send_data_to_compute_nodes(subchunk, array_bytes_to_go);
        } else receive_data_from_compute_nodes(subchunk, array_bytes_to_go);
        wait_for_completion(array_bytes_to_go, compute_node_array_);
        if (!read_op) write_data(subchunk);

        free_datatypes();
        subchunk->set_data_ptr(NULL);
      }
      current_chunk_id_ = current_array_->get_next_index(chunk,
                                                         current_chunk_id_,
                                                         my_io_rank_,
                                                         num_io_nodes_,
                                                         num_of_chunks_);
    } /* End while loop */
  } /* End if else */

#ifdef DEBUG
  printf("%d:Finished the I/O\n", world_rank_);
#endif
  if (part_time_io_){
    /* Since the I/O side is finished jump into the compute loop */
    while (array_bytes_to_go > 0)
      process_compute_message(array_bytes_to_go, compute_node_array_);
#ifdef DEBUG
    printf("%d:Finished the compute side of the part-time io\n", world_rank_);
#endif
  }

  /* Delete chunk, subchunk, compute_chunk */
  if (tmp_chunk) delete tmp_chunk;
  if (subchunk) delete subchunk;
  if (compute_chunk) delete compute_chunk;
  chunk=subchunk=compute_chunk=NULL;
}
+ MPI_Status status; + int *schema, schema_size; + + MPI_Probe(MPI_ANY_SOURCE, ARRAYGROUP_SCHEMA, MPI_COMM_WORLD, &status); + mpi_get_count(&status, MPI_INT, &schema_size); + schema = (int *)malloc(sizeof(int) * schema_size); + receive_message((void *)schema, schema_size, MPI_INT, status.MPI_SOURCE, + ARRAYGROUP_SCHEMA, MPI_COMM_WORLD, &status); + MPIFS_global_obj->Broadcast(COMPUTE_NODE, (void *)schema, + schema_size, MPI_INT, ARRAYGROUP_SCHEMA); + + int *size = (int *)malloc(sizeof(int) * schema[0]); + for (int i=0; i<schema[0]; i++) size[i] = schema[2+i]; +printf("%d: read rank %d, numbertype %d, size (%d %d %d)\n", world_rank_, + schema[0], schema[1], size[0], size[1], size[2]); + array->init(schema[0], schema[1], size, COMPUTE_NODE); + free(schema); + } + + int array_bytes_to_go = array->array_info(); + while (array_bytes_to_go > 0) + process_compute_message(array_bytes_to_go, array); +} + +void Simple_IO::process_compute_message(int &arrays_bytes_to_go, + Array *array) +{ + int msg_code, msg_tag, msg_src; + MPI_Status status; + int data_size; + + any_new_message(&msg_code, &msg_src, &msg_tag, &status); + switch(msg_code){ + case CHUNK_SCHEMA: + /* Do something about it */ + process_chunk_schema_request(msg_src,msg_tag, arrays_bytes_to_go, + &status, array); + break; + + case CHUNK_DATA_FROM_IO: + MPI_Get_count(&status, MPI_CHAR, &data_size); + printf("Received chunk_data before chunk schema from %d of size %d\n", + msg_src, data_size); + MPI_Probe(msg_src, (msg_tag/10)*10+CHUNK_SCHEMA, MPI_COMM_WORLD, &status); + printf("Received the corressponding chunk schema message\n"); + process_chunk_schema_request(msg_src, (msg_tag/10)*10+CHUNK_SCHEMA, + arrays_bytes_to_go, + &status, array); + break; + + case NO_MESSAGE: + /* Do nothing */ + break; + default: + /* This message is not for me */ + printf("In process compute message - unknown code %d\n", msg_code); + break; + } +} + +void Simple_IO::process_chunk_schema_request(int msg_src, int msg_tag, + int 
&array_bytes_to_go, + MPI_Status *status, Array *array) +{ + int *schema_buf, schema_size; + int chunk_id, op_type, array_rank, *base, *size, *stride, *ptr; + int data_size, elt_size, offset; + Boolean contiguous; + MPI_Datatype datatype; + Chunk *chunk; + void *data_ptr; + + MPI_Get_count(status, MPI_INT, &schema_size); + schema_buf = (int *) malloc(sizeof(int)*schema_size); + receive_message((void *)schema_buf, schema_size, MPI_INT, msg_src, + msg_tag, MPI_COMM_WORLD, status); + + ptr = schema_buf; + chunk_id = *ptr++; + ptr++; + contiguous = (Boolean) *ptr++; + chunk = array->find_chunk(chunk_id); + + if (contiguous){ + op_type = *ptr++; + offset = *ptr++; + data_size = *ptr++; + data_ptr = chunk->data_ptr(); + data_ptr = (char *)((char *) data_ptr + offset); + + if ((op_type == RESTART) || (op_type == READ_TIMESTEP) || + (op_type == GENERAL_READ)) + receive_message((void *) data_ptr, + data_size, + MPI_CHAR, msg_src, + (msg_tag/10*10)+CHUNK_DATA_FROM_IO, + MPI_COMM_WORLD, status); + else + send_message((void *)data_ptr, data_size,MPI_CHAR, msg_src, + (msg_tag/10)*10+CHUNK_DATA_TO_IO, + MPI_COMM_WORLD); + + } + else{ + array_rank = *ptr++; + op_type = *ptr++; + base = &ptr[0]; + size = &ptr[array_rank*1]; + stride = &ptr[array_rank*2]; + elt_size = chunk->element_size(); + data_size = num_elements(array_rank, size)*elt_size; + + chunk->make_datatype(base,size,stride, &data_ptr, &datatype); + if ((op_type == RESTART) || (op_type == READ_TIMESTEP) || + (op_type == GENERAL_READ)) + receive_message(data_ptr, 1, datatype,msg_src, + (msg_tag/10)*10+CHUNK_DATA_FROM_IO, + MPI_COMM_WORLD, status); + else + send_message(data_ptr, 1, datatype, msg_src, + (msg_tag/10)*10+CHUNK_DATA_TO_IO, + MPI_COMM_WORLD); + MPI_Type_free(&datatype); + } + + array_bytes_to_go -= data_size; + free(schema_buf); +} diff --git a/src/Panda/Simple_IO.h b/src/Panda/Simple_IO.h new file mode 100644 index 0000000..4df4831 --- /dev/null +++ b/src/Panda/Simple_IO.h @@ -0,0 +1,91 @@ +#ifndef 
Simple_IO_dot_h +#define Simple_IO_dot_h + +#include "Collective_IO.h" + + +class ArrayGroup; +class Array; +class Chunk; +class App_Info; + +//#include "../IEEEIO/IEEEIO.h" +//#include "../IEEEIO/IOProtos.h" + +class Simple_IO : public Collective_IO +{ + protected: + Boolean dummy_; /* Do the instance variables mean anything */ + int *schema_string_; + int schema_size_; + int *current_schema_ptr_; + Array *current_array_; + Chunk *current_chunk_; + int num_of_subchunks_; + int current_subchunk_id_; + int num_of_chunks_; + int current_chunk_id_; + IOFile file_ptr_; + FILE *schema_file_ptr_; + int num_io_nodes_; + int my_io_rank_; + int compute_app_num_; + App_Info *app_info_; + Boolean part_time_io_; + Array *compute_node_array_; + int op_type_; + Boolean nat_chunked_; + Boolean sub_chunked_; + Boolean overlaped_; + Boolean contiguous_; + int world_rank_; + + int num_overlaps_; + int max_overlaps_; + int *overlap_chunk_ids_; + int *dest_ids_; + int **schema_bufs_; + MPI_Request *schema_requests_; + MPI_Request *requests_; + MPI_Status *statuses_; + MPI_Datatype *datatypes_; + int max_rank_; + int array_rank_; + int *overlap_base_; + int *overlap_stride_; + int *overlap_size_; + char **data_ptrs_; + char *mem_buf_; + int mem_buf_size_; + + + void realloc_buffers(int); + void compute_chunk_overlaps(Array*,Chunk*); + void compute_schemas(Array*,Chunk*,Chunk*); + virtual void send_schema_message(int); + void make_datatype(Chunk*,int); + void receive_data(Chunk*,int, int&); + void send_data(Chunk*, int, int&); + void read_data(Chunk*); + void read_data(char*,int,int); + void write_data(char*,int,int); + void write_data(Chunk*); + void copy_data(Chunk*,int,Boolean,int&); + void direct_io(int,Boolean,int&); + void free_datatypes(); + void wait_for_completion(int&,Array*); + void send_data_to_compute_nodes(Chunk*, int&); + void receive_data_from_compute_nodes(Chunk*, int&); + void realloc_schema_bufs(int); + void realloc_mem_bufs(int); + void 
process_compute_message(int&,Array*); + void process_chunk_schema_request(int,int,int&,MPI_Status*,Array*); + public: + Simple_IO(); + Simple_IO(int*,int,int,int, int , App_Info*, IOFile); + virtual ~Simple_IO(); + virtual void start_to_finish(Boolean part_time_io,Array*); + virtual void compute_node_io_loop(Array*); +}; + +#endif diff --git a/src/Panda/StopWatch.h b/src/Panda/StopWatch.h new file mode 100644 index 0000000..e38c5d7 --- /dev/null +++ b/src/Panda/StopWatch.h @@ -0,0 +1,34 @@ +#ifndef StopWatch_dot_h +#define StopWatch_dot_h + +#include <stdio.h> +#include <mpi.h> + +class StopWatch +{ + private: + double start_t,finish_t; + char description[200]; + + public: + StopWatch () { start_t = finish_t = -1; } + ~StopWatch() { }; + void start() { start_t = MPI_Wtime(); } + void stop (char *desc) + { + finish_t = MPI_Wtime(); + if (start_t == -1.0) + fprintf(stderr, "StopWatch: must start before stop\n"); + else + sprintf(description, "%s elapsed time: %f, (%f, %f)\n" + ,desc + ,finish_t-start_t + ,start_t, finish_t); + start_t = finish_t = -1; + } + char *get_description() { return description;} + +}; + + +#endif diff --git a/src/Panda/Template.C b/src/Panda/Template.C new file mode 100644 index 0000000..5600e2f --- /dev/null +++ b/src/Panda/Template.C @@ -0,0 +1,40 @@ +#include "definitions.h" +#include "Template.h" + +Template::Template(int Rank, int *sizearray) +{ + rank_ = Rank; + if (sizearray) size_ = copy_int_list(Rank, sizearray); +} + +Template::Template() +{ + rank_ = 0; + size_ = NULL; +} + +Template::~Template() +{ + if (size_ != NULL) free(size_); + size_ = NULL; +} + +int Template::rank() +{ + return rank_; +} + +int* Template::size() +{ + return size_; +} + +int Template::total_elements() +{ + return num_elements(rank_, size_); +} + +int Template::size(int i) +{ + return size_[i]; +} diff --git a/src/Panda/Template.h b/src/Panda/Template.h new file mode 100644 index 0000000..ff9483a --- /dev/null +++ b/src/Panda/Template.h @@ -0,0 +1,22 @@ 
/* Abstract base for virtual file-system back ends.  It carries no state
 * of its own; the virtual destructor exists so that derived objects can
 * be deleted safely through a VirtFS pointer. */
class VirtFS
{
  public:
    VirtFS() {}
    virtual ~VirtFS() {}
};
processor topology - dist */ + int disk_rank_; /* io processor topology - rank */ + int* disk_layout_; /* io processor topology - mesh */ + Distribution* disk_dist_; /* io processor topology - dist */ + char* data_; /* data pointer belonging to me */ + int stencil_width_; /* stencil width */ + struct ArrayInfo *next_; /* next element */ +} ArrayInfo; + +Panda *global_bear = NULL; +extern MPIFS *MPIFS_global_obj; + +int Panda_Create(int ioproc_every, int is_part_time_mode) +{ + int i, my_app_size, my_rank, *world_ranks; + int io_nodes; + +/* if (io_nodes > 1) { + printf("Warning: Write Chunks instead of Write arrays.\n"); + printf("There might be errors in Attributes write\n"); + }*/ + + MPI_Comm_size(MPI_COMM_WORLD, &my_app_size); + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + world_ranks = (int *) malloc(sizeof(int)*my_app_size); + for (i=0; i<my_app_size; i++) world_ranks[i] = i; + io_nodes = (my_app_size - 1) / ioproc_every + 1; + + if (is_part_time_mode) { + if (my_rank < io_nodes) { /* part-time io nodes */ + global_bear = new Panda(PART_TIME_IO, my_rank, my_app_size, world_ranks, + my_rank, io_nodes, world_ranks); + //printf("##### Panda proc %d/%d PART_TIME_IO\n", my_rank, my_app_size); + } else { /* part-time compute nodes */ + global_bear = new Panda(PART_TIME_COMPUTE, my_rank, my_app_size, + world_ranks, -1, io_nodes, world_ranks); + //printf("##### Panda proc %d/%d PART_TIME_COMPUTE\n", my_rank, my_app_size); + } + } else { +// printf("Warning: Full-time I/O nodes is not integrated with Cactus yet, "); +// printf("due to the communicator problem. 
Panda's part is done, though\n"); + if (my_rank < io_nodes) { /* full-time io nodes */ + global_bear = new Panda(IO_NODE, 0, my_rank, io_nodes, world_ranks); + delete global_bear; + free(world_ranks); + return 1; + } else { /* compute nodes */ + for (i=0; i<(my_app_size-io_nodes); i++) world_ranks[i] += io_nodes; + global_bear = new Panda(COMPUTE_NODE, 1, my_rank-io_nodes, + my_app_size-io_nodes, world_ranks); + } + } + free(world_ranks); + return 0; +} + +void Panda_Finalize() +{ + if (global_bear) delete global_bear; +} + + +void PandaTimestep(ArrayInfo *ptr) +{ +/* Test if Panda_Create() has been called */ + if (global_bear == NULL) { + printf("Panda object is not created yet - Use Panda_Create(...)\n"); + return; + } + +/* Create array information */ + ArrayLayout *mem_layout, *disk_layout; + Array *array; + + mem_layout = new ArrayLayout(ptr->mem_rank_, ptr->mem_layout_); + disk_layout = new ArrayLayout(ptr->disk_rank_, ptr->disk_layout_); + + array = new Array(ptr->name_, ptr->rank_, ptr->size_, ptr->esize_, + mem_layout, ptr->mem_dist_, + disk_layout, ptr->disk_dist_, + ptr->data_, ptr->stencil_width_); + + global_bear->app_barrier(); + printf("---------------- Panda Timestep -------------------\n"); + //printf("name %s rank %d size %d %d %d esize %d mem_layout %d %d %d disk_layout %d stencil_width_ %d\n", ptr->name_, ptr->rank_, ptr->size_[0], ptr->size_[1], ptr->size_[2], ptr->esize_, ptr->mem_layout_[0], ptr->mem_layout_[1], ptr->mem_layout_[2], ptr->disk_layout_[0], ptr->stencil_width_); + + array->timestep(); + + delete mem_layout; + delete disk_layout; + delete array; +} + +void *PandaReadTimestep(ArrayInfo *ptr) +{ +/* Test if Panda_Create() has been called */ + if (global_bear == NULL) { + printf("Panda object is not created yet - Use Panda_Create(...)\n"); + return NULL; + } + +/* Create array information */ + ArrayLayout *mem_layout, *disk_layout; + Array *array; + + mem_layout = new ArrayLayout(ptr->mem_rank_, ptr->mem_layout_); + disk_layout = 
NULL; + array = new Array(ptr->name_, ptr->rank_, ptr->size_, ptr->esize_, + mem_layout, ptr->mem_dist_, + disk_layout, ptr->disk_dist_, + ptr->data_, ptr->stencil_width_); + + printf("---------------- Panda ReadTimestep -------------------\n"); + global_bear->app_barrier(); + array->read_timestep(); + void *data = (void *)array->get_data_ptr(); + array->set_data_ptr(NULL); + + delete mem_layout; + delete array; + return data; +} + +void Panda_WriteAttribute(char *fname, char *name, int esize, + int count, void *data) +{ + Attribute *attr = new Attribute(); + attr->write(fname, name, esize, count, data); + delete attr; +} + +void *Panda_ReadAttribute(char *fname, char *name, int *type, int *count) +{ + Attribute *attr = new Attribute(); + attr->read(fname, name); + void *data = attr->get_data_ptr(); + attr->set_data_ptr(NULL); + *type = attr->esize(); + *count = attr->count(); + delete attr; + return data; +} + +Boolean PandaIsNewFile(char *fname) +{ + return MPIFS_global_obj->is_new_file(fname); +} diff --git a/src/Panda/c_interface.h b/src/Panda/c_interface.h new file mode 100644 index 0000000..b167f6f --- /dev/null +++ b/src/Panda/c_interface.h @@ -0,0 +1,28 @@ +#ifndef _included_C_Interface_h +#define _included_C_Interface_h + +#include "external/IEEEIO/src/IEEEIO.h" + + +typedef enum { NONE, + BLOCK, + GENERAL, + CYCLIC + } Distribution; +typedef struct ArrayInfo { + char* name_; /* array name */ + int rank_; /* rank */ + int* size_; /* glbal size of the array */ + int esize_; /* size of each element */ + int mem_rank_; /* compute processor topology - rank */ + int* mem_layout_; /* compute processor topology - mesh */ + Distribution* mem_dist_; /* compute processor topology - dist */ + int disk_rank_; /* io processor topology - rank */ + int* disk_layout_; /* io processor topology - mesh */ + Distribution* disk_dist_; /* io processor topology - dist */ + char* data_; /* data pointer belonging to me */ + int stencil_width_; /* stencil width */ + struct 
ArrayInfo *next_; /* next element */ +} ArrayInfo; + +#endif diff --git a/src/Panda/compute_test.C b/src/Panda/compute_test.C new file mode 100644 index 0000000..fc61f34 --- /dev/null +++ b/src/Panda/compute_test.C @@ -0,0 +1,350 @@ +/***************************************************************** + * This is a sample program that shows how the panda library * + * is going to be used by the application programs. * + * The example command line format is in test7.script. * + * This example shows the interface with only disk layout * + * info but no stride or subchunking schema. The value for * + * those schemas use the default ones. * + * The current test varies the size of arrays. However, the * + * wrapper function allows the number of the nodes to be * + * changed as well. * + * The first iteration loads all the code in memory. * + * The second run does the simulated disk simulation. * + * From the third run on, the values are the real writes. * + *****************************************************************/ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <ctype.h> +#include "definitions.h" +#include "StopWatch.h" +#include "ArrayGroup.h" +#include "ArrayLayout.h" +#include "Array.h" +#include "Panda.h" + +int Num_of_Arrays = 1; +int Num_Simulate_Read = 0; +int Num_Read = 0; +int Num_Simulate_Write = 2; +int Num_Write = 2 ; +int interleave = 0; +Panda *global_bear; +extern int SUBCHUNK_SIZE; +int STRATEGY = 1; + +void test_timestep(ArrayGroup *t1, int arraysize, Array **arrays) +{ + StopWatch timer; + int i; + int flag=0; + char time_message[100]; + +#ifdef VERIFYBF + for (int j=0; j<Num_of_Arrays; j++) arrays[j]->set_byte_pattern(); + t1->set_verify(); +#endif + + + global_bear->cleanfiles(); + global_bear->createfiles(); + + + for (i=0; i<Num_Simulate_Write+Num_Write; ++i) { + if (i < Num_Simulate_Write){ + t1->set_simulate_mode(); + flag=0; + } + else { + t1->reset_simulate_mode(); + flag=1; + } + + + global_bear->app_barrier(); 
+ t1->set_io_strategy(STRATEGY); + timer.start(); + t1->timestep(); + timer.stop(":"); + sprintf(time_message,"%s Write: SIZE: %d, Time %i %s", + (flag==0? "Simulated":"Real"), + arraysize, i, timer.get_description()); + printf("%s", time_message); + + if (Num_Read + Num_Simulate_Read == 0 || i < Num_Simulate_Write + Num_Write-1 ) { + global_bear->cleanfiles(); + global_bear->createfiles(); + + } + } +} + +void test_readtimestep(ArrayGroup *r1, int arraysize, Array **arrays) +{ + StopWatch timer; + int i; + int flag; + char time_message[100]; +#ifdef VERIFYBF + for (int j=0; j<Num_of_Arrays; j++) arrays[j]->reset_byte_pattern(); +#endif + + if (Num_Write + Num_Simulate_Write == 0) { + global_bear->cleanfiles(); + global_bear->createfiles(); + } + + + + for (i=0; i<Num_Simulate_Read+Num_Read; ++i) { + if (i < Num_Simulate_Read) { r1->set_simulate_mode(); flag=0; } + else {r1->reset_simulate_mode(); + flag=1; + global_bear->flushfiles(); + } + + + global_bear->app_barrier(); + r1->set_io_strategy(STRATEGY); + timer.start(); + r1->read_timestep(); + timer.stop(":"); + + sprintf(time_message,"%s Read: SIZE: %d, Time %i %s ", + (flag==0? 
"Simulated":"Real"), + arraysize, i, timer.get_description()); + printf("%s", time_message); + } +#ifdef VERIFYBF + for(i=0;i<Num_of_Arrays;i++) + if (arrays[i]->verify_byte_pattern()) + printf("Byte pattern verified for array %d\n", i); + else + printf("Byte pattern incorrect for array %d\n", i); +#endif + global_bear->cleanfiles(); +} + + +int gemein(Panda *bear, int io_nodes, int arrayrank, int *arraysize, int esize, + int mrank, int *mlayout, int drank, int *dlayout, + Distribution *mem_dist, Distribution *disk_dist, int cost_model) +{ + ArrayLayout *mem1; // Memory array layout + ArrayLayout *disk1; // Disk array layout + int i; + Array **arrays; + arrays = (Array **)malloc(sizeof(Array*)*Num_of_Arrays); + +// Set up memory and disk layouts + mem1 = new ArrayLayout (mrank,mlayout); + disk1 = new ArrayLayout(drank,dlayout); + +// Create an Array for computation. + char *name; + name = (char *)malloc(sizeof(char)*(strlen("z1Array")+5)); + char temp[5]; + for (i=0; i< Num_of_Arrays; i++) { + strcpy(name,"z1Array"); + sprintf(temp, "%d", i); + strcat(name, temp); + arrays[i] = new Array(name,arrayrank,arraysize,esize, + mem1,mem_dist,disk1, disk_dist); + } + free(name); + + if (Num_Simulate_Write + Num_Write > 0) { + ArrayGroup *t1 = new ArrayGroup("z4timestep"); + for (i= 0; i<Num_of_Arrays; i++) t1->insert(arrays[i]); + test_timestep(t1, arraysize[arrayrank-1], arrays); + delete t1; + if (Num_Simulate_Read + Num_Read > 0) { + ArrayGroup *r1 = new ArrayGroup("z4timestep"); + for (i= 0; i<Num_of_Arrays; i++) r1->insert(arrays[i]); + test_readtimestep(r1, arraysize[arrayrank-1], arrays); + delete r1; + } + } else { + + ArrayGroup *r1 = new ArrayGroup("z4timestep"); + for (i= 0; i<Num_of_Arrays; i++) r1->insert(arrays[i]); + test_readtimestep(r1, arraysize[arrayrank-1], arrays); + delete r1; + } + + // delete all objects created + + for (i=0; i<Num_of_Arrays; i++) delete arrays[i]; + free(arrays); + delete disk1; + delete mem1; + return(0); +} + +char my_getopt(char 
*str) +{ + char command[23][15]; + + strcpy(command[0], "-Total_nodes"); + strcpy(command[1], "-Io_nodes"); + strcpy(command[2], "-upper"); + strcpy(command[3], "-Arraysize"); + strcpy(command[4], "-Esize"); + strcpy(command[5], "-Mlayout"); + strcpy(command[6], "-Dlayout"); + strcpy(command[7], "-mem_dist"); + strcpy(command[8], "-disk_dist"); + strcpy(command[9], "-num_arrays"); + strcpy(command[10], "-read_simulate"); + strcpy(command[11], "-Read"); + strcpy(command[12], "-write_simulate"); + strcpy(command[13], "-Write"); + strcpy(command[14], "-interleave"); + strcpy(command[15], "-Cost_model"); + strcpy(command[16], "-size_message"); + strcpy(command[17], "-Xfactor"); + + for (int i= 0; i< 18; i++) + if (!strncmp(str, command[i], 2)) return command[i][1]; + printf("undefined input %s, quit!\n",str); + exit(0); +} + +void parse_cl(int argc, char **argv, int &total_nodes, int &io_nodes, + int &upper_bound, int &lower_bound, int &arrayrank, int*& arraysize, + int &esize, int &mrank, int*& mlayout, int& drank, int*& dlayout, + Distribution*& mem_dist, Distribution*& disk_dist, int &cost_model_mode) +{ + char opt; + int k; + + for (int i=1; i<argc; ) { + opt = my_getopt(argv[i++]); + switch(opt) { + case 'X': + STRATEGY = atoi(argv[i++]); + break; + case 'T': + total_nodes = atoi(argv[i++]); + break; + case 'I': + io_nodes = atoi(argv[i++]); + break; + case 'u': + upper_bound = atoi(argv[i++]); + break; + case 'A': + arrayrank = atoi(argv[i++]); + arraysize = (int *) malloc(sizeof(int)* arrayrank); + mem_dist = (Distribution *)malloc(sizeof(Distribution)*arrayrank); + disk_dist = (Distribution *)malloc(sizeof(Distribution)*arrayrank); + for (k = 0; k < arrayrank; k++) arraysize[k] = atoi(argv[i++]); + lower_bound = arraysize[k-1]; + break; + case 'E': + esize = atoi(argv[i++]); + break; + case 'M': + mrank = atoi(argv[i++]); + mlayout = (int *) malloc(sizeof(int)* mrank); + for (k = 0; k < mrank; k++) mlayout[k] = atoi(argv[i++]); + break; + case 'D': + drank = 
atoi(argv[i++]); + dlayout = (int *) malloc(sizeof(int)* drank); + for (k = 0; k < drank; k++) dlayout[k] = atoi(argv[i++]); + break; + case 'm': + for (k = 0; k < arrayrank; k++) mem_dist[k] = (Distribution)atoi(argv[i++]); + break; + case 'd': + for (k = 0; k < arrayrank; k++) disk_dist[k] = (Distribution)atoi(argv[i++]); + break; + case 'n': + Num_of_Arrays = atoi(argv[i++]); + break; + case 'r': + Num_Simulate_Read = atoi(argv[i++]); + break; + case 'R': + Num_Read = atoi(argv[i++]); + break; + case 'w': + Num_Simulate_Write = atoi(argv[i++]); + break; + case 'W': + Num_Write = atoi(argv[i++]); + break; + case 'i': + interleave = atoi(argv[i++]); + break; + case 'C': + cost_model_mode = atoi(argv[i++]); + break; + case 's': + SUBCHUNK_SIZE = atoi(argv[i++]); + break; + /* For Panda internal library stuff */ + + } + } +} + +int main(int argc, char **argv) +{ + int total_nodes; // The number of total nodes (comp + io) + int io_nodes; // The number of io nodes + int upper_bound; // The upper bound of the last dimension of the array + int lower_bound; // The starting number of the last dimension of the array + int arrayrank ; // The array rank. + int *arraysize; // The number of elements along each array dimention + int esize ; // element size of each array element + int mrank ; // Compute node mesh rank + int *mlayout; // Compute node mesh layout + int drank ; // IO node mesh rank + int cost_model_mode; // Whether the cost model is included. + int *dlayout; // IO node mesh layout + Distribution *mem_dist; // The memory array distribution along each dimention + // There are three possible distributions (BLOCK, + // NONE, CYCLIC). + Distribution *disk_dist; // The disk array distribution along each dimention + int my_rank, my_app_size, *world_ranks, leader; + + + MPI_Init(&argc, &argv); + +// For Parallel architecture (IBM SP2 like), +// Initialize the MPI environment. Only compute nodes will return from +// this call, the io nodes will not return from the call. 
All the io nodes + + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + MPI_Comm_size(MPI_COMM_WORLD, &my_app_size); + + parse_cl(argc, argv, total_nodes, io_nodes, upper_bound, lower_bound, arrayrank, + arraysize, esize, mrank, mlayout, drank, dlayout, mem_dist, disk_dist, cost_model_mode); + + leader = io_nodes; + world_ranks = (int *) malloc(sizeof(int)*my_app_size); + for(int i=0;i< my_app_size; i++) + world_ranks[i] = leader+i; + + Panda * bear = new Panda(COMPUTE_NODE, 1, my_rank, my_app_size, + world_ranks); + global_bear = bear; + + for (int size=lower_bound; size <= upper_bound; size*=2) { + arraysize[arrayrank-1] = size; + gemein(bear,io_nodes, arrayrank, arraysize, esize, + mrank, mlayout, drank, dlayout, mem_dist, disk_dist, cost_model_mode); + } + free(arraysize); + free(mlayout); + free(dlayout); + free(mem_dist); + free(disk_dist); + free(world_ranks); + delete bear; + MPI_Finalize(); + return(0); +} diff --git a/src/Panda/configure b/src/Panda/configure new file mode 100644 index 0000000..f34cfdd --- /dev/null +++ b/src/Panda/configure @@ -0,0 +1,75 @@ +#!/bin/sh +# this is a script that is intended to guide the procession of +# our makefiles in an independent way across multiple OS platforms +# and multiple hardware platform...(first for sun) +echo "checking out target machine:" +X="os-detected" +MY_OS="" + +if [ `uname -a | fgrep -i sun | wc -l` -ne 0 ] ; then + MY_OS="sunos" +fi + +if [ `uname -a | fgrep -i aix | wc -l` -ne 0 ] ; then + MY_OS="aix" +fi + +if [ `uname -a | fgrep -i hp-ux | wc -l` -ne 0 ] ; then + MY_OS="hp-ux" +fi + +if [ `uname -a | fgrep -i irix | wc -l` -ne 0 ] ; then + MY_OS="irix" +fi + +/bin/rm -fr makefile + +case $MY_OS in + "sunos") + echo " detected SunOS..." + echo $MY_OS > $X + echo "include makefile.sun.mpich" > makefile + ;; + "aix") + echo " detected AIX..." + echo $MY_OS > $X + echo "include makefile.ibm.mpif" > makefile + ;; + "irix") + echo " detected IRIX..." 
+ echo $MY_OS > $X + echo "include makefile.sgi.mpich" > makefile + ;; + "hp-ux") + echo " detected HP-UX..." + echo $MY_OS > $X + echo "include makefile.hpux.mpich" > makefile + ;; + *) + echo "Hey, I don't know this operating system..." + echo $MY_OS > $X + echo "include makefile.unix.posix" > makefile + ;; +esac + +cat makefile.proto >> makefile + +case $MY_OS in + "irix") # those folks busted "which" + FOUND_MPI=`which -f mpirun |wc |awk '{print $2}'` + ;; + *) + FOUND_MPI=`which mpirun |wc |awk '{print $2}'` + ;; +esac + +if [ $FOUND_MPI -ne 0 ] ; then + echo " found MPI..." +fi +FP='/scratch-modi4/'`whoami`'/' +mkdir $FP >/dev/null 2>&1 +echo " user temp directory $FP exists..." +echo 'FILEPREFIXVAL=\"'$FP'\"' > fileprefix +echo "the file \"makefile\" is now configured for target." + +exit 0 diff --git a/src/Panda/definitions.h b/src/Panda/definitions.h new file mode 100644 index 0000000..32c4da0 --- /dev/null +++ b/src/Panda/definitions.h @@ -0,0 +1,186 @@ +#ifndef definitions_dot_h +#define definitions_dot_h + +#include<stdio.h> +#include<stdlib.h> +#include<string.h> + +#include "cctk.h" + +extern "C" {int fsync(int f);} + + + +#define START 0 +#define WAITING 1 + +/* Different I/O strategies */ +#define SIMPLE_IO 1 +#define CSDIO_IO 2 + +/* The different possible nodetypes */ +#define COMPUTE_NODE 0 +#define IO_NODE 1 +#define PART_TIME_COMPUTE 2 +#define PART_TIME_IO 3 +#define SUB_CHUNK 4 +#define PART_TIME 5 + + +/* Unix or MPI based file system */ +#define MPI_SYSTEM 0 +#define UNIX_SYSTEM 1 + +/* Different kinds of collective I/O operations */ +#define RESTART 0 +#define READ_TIMESTEP 1 +#define GENERAL_READ 2 +#define CHECKPOINT 3 +#define TIMESTEP 4 +#define GENERAL_WRITE 5 + + +/* Tags to indicate the type of the message */ + +/* #define NO_MESSAGE 10 + #define SPECIAL 9 + #define ARRAYGROUP_SCHEMA 8 + #define CHUNK_DATA_TO_IO 7 + #define APP_IO_DONE 6 + #define QUIT 5 + #define COMP_QUIT 4 + #define CHUNK_SCHEMA 3 + #define CHUNK_DATA_FROM_IO 2 + 
#define CHUNK_SCHEMA_DATA 1 +*/ +/* Modified it to make it compatible with my thesis */ +#define CHUNK_SCHEMA 1 +#define CHUNK_DATA_FROM_IO 2 +#define CHUNK_DATA_TO_IO 3 + +#define COMP_QUIT 4 +#define QUIT 5 +#define ATTRIBUTE_SCHEMA 6 +#define ATTRIBUTE_DATA 7 + +#define ARRAYGROUP_SCHEMA 8 +#define SPECIAL 9 +#define NO_MESSAGE 10 + +/* Tags to indicate the type of special operatiosn required */ +#define APP_INFO 1 +#define APP_BARRIER 2 +#define GLOBAL_BARRIER 3 +#define CLEANFILES 4 +#define FLUSHFILES 5 +#define CREATEFILES 6 + +typedef enum { UNSET, + Regular, + Irregular + } Distribution_Type; + +typedef enum { NONE, + BLOCK, + GENERAL, + CYCLIC + } Distribution; + +typedef enum { HPF, + NAS, + GENERAL_BLOCK + } Block_Distribution; + +typedef enum { ROUND_ROBIN, + REGULAR + } ChunkAllocPolicy; + + +typedef enum { NO = 0, + YES = 1 + } Boolean; + + +typedef enum { ALLOC, + NO_ALLOC, + SHARED + } DataStatus; + + + +inline int max(int a, int b) +{ + if (a > b) return a; + else return b; +} + +inline int min(int a, int b) +{ + if (a < b) return a; + else return b; +} + + +inline int* copy_int_list(int s, int *l) +{ + int *ret_list = (int *) malloc(sizeof(int)*s); + for(int i=0;i<s;i++) + ret_list[i] = l[i]; + return ret_list; +} + + + +inline Distribution* copy_distribution(int num, Distribution *ptr) +{ + Distribution *ret_list = (Distribution *)malloc(sizeof(Distribution)*num); + + for(int i=0; i < num; i++) + ret_list[i] = ptr[i]; + + return ret_list; +} + + +inline Boolean equal_distribution(int size, Distribution* dist1, Distribution* dist2) +{ + for(int i=0; i < size; i++) + { + if (dist1[i] != dist2[i]) + return NO; + } + return YES; +} + +inline void pack_distribution(int **schema_buf, int rank, Distribution *in_dist) +{ + Distribution *dist = in_dist; + int* ptr = *schema_buf; + + for(int i=0;i<rank;i++) + *ptr++ = (int) dist[i]; + *schema_buf = ptr; +} + +inline Distribution* new_distribution(int **schema_buf, int rank) +{ + Distribution *dist = 
(Distribution*) malloc(sizeof(Distribution)*rank); + int *ptr = *schema_buf; + + for(int i=0;i<rank;i++) + dist[i] = (Distribution) *ptr++; + + *schema_buf = ptr; + return dist; + +} + +inline int num_elements(int r, int *size) +{ + int total=1; + for(int i=0;i<r;i++) total *= size[i]; + return total; +} + + +#endif diff --git a/src/Panda/fulltime.C b/src/Panda/fulltime.C new file mode 100644 index 0000000..dd195f0 --- /dev/null +++ b/src/Panda/fulltime.C @@ -0,0 +1,410 @@ +/***************************************************************** + * This is a sample program that shows how the panda library * + * is going to be used by the application programs. * + * The example command line format is in test7.script. * + * This example shows the interface with only disk layout * + * info but no stride or subchunking schema. The value for * + * those schemas use the default ones. * + * The current test varies the size of arrays. However, the * + * wrapper function allows the number of the nodes to be * + * changed as well. * + * The first iteration loads all the code in memory. * + * The second run does the simulated disk simulation. * + * From the third run on, the values are the real writes. 
* + *****************************************************************/ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <ctype.h> +#include "definitions.h" +#include "StopWatch.h" +#include "ArrayGroup.h" +#include "ArrayLayout.h" +#include "Array.h" +#include "Panda.h" + +int Num_of_Arrays = 1; +int Num_Simulate_Read = 0; +int Num_Read = 0; +int Num_Simulate_Write = 2; +int Num_Write = 2 ; +int interleave = 0; +Panda *global_bear; +extern int SUBCHUNK_SIZE; +int STRATEGY = 1; +int BLK; + +int CYCLIC_ON_MEM = 0; + +void test_timestep(ArrayGroup *t1, int arraysize, Array **arrays) +{ + StopWatch timer; + int i; + int flag=0; + char time_message[100]; + +#ifdef VERIFYBF + for (int j=0; j<Num_of_Arrays; j++) arrays[j]->set_byte_pattern(); + t1->set_verify(); +#endif + + + global_bear->cleanfiles(); + global_bear->createfiles(); + + + for (i=0; i<Num_Simulate_Write+Num_Write; ++i) { + if (i < Num_Simulate_Write){ + t1->set_simulate_mode(); + flag=0; + } + else { + t1->reset_simulate_mode(); + flag=1; + } + + + global_bear->app_barrier(); + t1->set_io_strategy(STRATEGY); + timer.start(); + t1->timestep(); + timer.stop(":"); + sprintf(time_message,"%s Write: SIZE: %d, BLK: %d, Time %i %s", + (flag==0? 
"Simulated":"Real"), + arraysize, BLK, i, timer.get_description()); + printf("%s", time_message); + + if (Num_Read + Num_Simulate_Read == 0 || i < Num_Simulate_Write + Num_Write-1 ) { + global_bear->cleanfiles(); + global_bear->createfiles(); + + } + } +} + +void test_readtimestep(ArrayGroup *r1, int arraysize, Array **arrays) +{ + StopWatch timer; + int i; + int flag; + char time_message[100]; +#ifdef VERIFYBF + for (int j=0; j<Num_of_Arrays; j++) arrays[j]->reset_byte_pattern(); +#endif + + if (Num_Write + Num_Simulate_Write == 0) { + global_bear->cleanfiles(); + global_bear->createfiles(); + } + + + + for (i=0; i<Num_Simulate_Read+Num_Read; ++i) { + if (i < Num_Simulate_Read) { r1->set_simulate_mode(); flag=0; } + else {r1->reset_simulate_mode(); + flag=1; + global_bear->flushfiles(); + } + + + global_bear->app_barrier(); + r1->set_io_strategy(STRATEGY); + timer.start(); + r1->read_timestep(); + timer.stop(":"); + + sprintf(time_message,"%s Read: SIZE: %d, Time %i %s ", + (flag==0? "Simulated":"Real"), + arraysize, i, timer.get_description()); + printf("%s", time_message); + } +#ifdef VERIFYBF + for(i=0;i<Num_of_Arrays;i++) + if (arrays[i]->verify_byte_pattern()) + printf("Byte pattern verified for array %d\n", i); + else + printf("Byte pattern incorrect for array %d\n", i); +#endif + global_bear->cleanfiles(); +} + + +int gemein(Panda *bear, int io_nodes, int arrayrank, int *arraysize, int esize, + int mrank, int *mlayout, int drank, int *dlayout, + Distribution *mem_dist, int* blk_size, Distribution *disk_dist, int cost_model) +{ + ArrayLayout *mem1; // Memory array layout + ArrayLayout *disk1; // Disk array layout + int i; + Array **arrays; + arrays = (Array **)malloc(sizeof(Array*)*Num_of_Arrays); + +// Set up memory and disk layouts + mem1 = new ArrayLayout (mrank,mlayout); + disk1 = new ArrayLayout(drank,dlayout); + +// Create an Array for computation. 
+ char *name; + name = (char *)malloc(sizeof(char)*(strlen("z1Array")+5)); + char temp[5]; + for (i=0; i< Num_of_Arrays; i++) { + strcpy(name,"z1Array"); + sprintf(temp, "%d", i); + strcat(name, temp); + arrays[i] = new Array(name,arrayrank,arraysize,esize, + mem1,mem_dist,disk1, disk_dist); + } + free(name); + + if (Num_Simulate_Write + Num_Write > 0) { + ArrayGroup *t1 = new ArrayGroup("z4timestep"); + for (i= 0; i<Num_of_Arrays; i++) t1->insert(arrays[i]); + test_timestep(t1, arraysize[arrayrank-1], arrays); + delete t1; + if (Num_Simulate_Read + Num_Read > 0) { + ArrayGroup *r1 = new ArrayGroup("z4timestep"); + for (i= 0; i<Num_of_Arrays; i++) r1->insert(arrays[i]); + test_readtimestep(r1, arraysize[arrayrank-1], arrays); + delete r1; + } + } else { + + ArrayGroup *r1 = new ArrayGroup("z4timestep"); + for (i= 0; i<Num_of_Arrays; i++) r1->insert(arrays[i]); + test_readtimestep(r1, arraysize[arrayrank-1], arrays); + delete r1; + } + + // delete all objects created + + for (i=0; i<Num_of_Arrays; i++) delete arrays[i]; + free(arrays); + delete disk1; + delete mem1; + return(0); +} + +char my_getopt(char *str) +{ + char command[24][15]; + + strcpy(command[0], "-Total_nodes"); + strcpy(command[1], "-Io_nodes"); + strcpy(command[2], "-upper"); + strcpy(command[3], "-Arraysize"); + strcpy(command[4], "-Esize"); + strcpy(command[5], "-Mlayout"); + strcpy(command[6], "-Dlayout"); + strcpy(command[7], "-mem_dist"); + strcpy(command[8], "-disk_dist"); + strcpy(command[9], "-num_arrays"); + strcpy(command[10], "-read_simulate"); + strcpy(command[11], "-Read"); + strcpy(command[12], "-write_simulate"); + strcpy(command[13], "-Write"); + strcpy(command[14], "-interleave"); + strcpy(command[15], "-Cost_model"); + strcpy(command[16], "-size_message"); + strcpy(command[17], "-Xfactor"); + strcpy(command[18], "-K"); + + for (int i= 0; i< 24; i++) + if (!strncmp(str, command[i], 2)) return command[i][1]; + printf("undefined input %s, quit!\n",str); + exit(0); +} + +void 
parse_cl(int argc, char **argv, int &total_nodes, int &io_nodes, + int &upper_bound, int &lower_bound, int &arrayrank, int*& arraysize, + int &esize, int &mrank, int*& mlayout, int& drank, int*& dlayout, + Distribution*& mem_dist, int*& blk_size, + Distribution*& disk_dist, int &cost_model_mode, int &upper_blk) +{ + char opt; + int k; + + for (int i=1; i<argc; ) { + opt = my_getopt(argv[i++]); + switch(opt) { + case 'X': + STRATEGY = atoi(argv[i++]); + break; + case 'T': + total_nodes = atoi(argv[i++]); + break; + case 'I': + io_nodes = atoi(argv[i++]); + break; + case 'u': + upper_bound = atoi(argv[i++]); + break; + case 'A': + arrayrank = atoi(argv[i++]); + arraysize = (int *) malloc(sizeof(int)* arrayrank); + mem_dist = (Distribution *)malloc(sizeof(Distribution)*arrayrank); + blk_size = (int *) malloc(sizeof(int)*arrayrank); + disk_dist = (Distribution *)malloc(sizeof(Distribution)*arrayrank); + for (k = 0; k < arrayrank; k++) arraysize[k] = atoi(argv[i++]); + lower_bound = arraysize[k-1]; + break; + case 'E': + esize = atoi(argv[i++]); + break; + case 'M': + mrank = atoi(argv[i++]); + mlayout = (int *) malloc(sizeof(int)* mrank); + for (k = 0; k < mrank; k++) mlayout[k] = atoi(argv[i++]); + break; + case 'D': + drank = atoi(argv[i++]); + dlayout = (int *) malloc(sizeof(int)* drank); + for (k = 0; k < drank; k++) dlayout[k] = atoi(argv[i++]); + break; + case 'm': + for (k = 0; k < arrayrank; k++) + { + mem_dist[k] = (Distribution)atoi(argv[i++]); + } + break; + case 'd': + for (k = 0; k < arrayrank; k++) + { + disk_dist[k] = (Distribution)atoi(argv[i++]); + } + break; + + case 'n': + Num_of_Arrays = atoi(argv[i++]); + break; + case 'r': + Num_Simulate_Read = atoi(argv[i++]); + break; + case 'R': + Num_Read = atoi(argv[i++]); + break; + case 'w': + Num_Simulate_Write = atoi(argv[i++]); + break; + case 'W': + Num_Write = atoi(argv[i++]); + break; + case 'i': + interleave = atoi(argv[i++]); + break; + case 'C': + cost_model_mode = atoi(argv[i++]); + break; + case 
's': + SUBCHUNK_SIZE = atoi(argv[i++]); + break; + + case 'K': + upper_blk = atoi(argv[i++]); + break; + + /* For Panda internal library stuff */ + + + } + } + + printf("####### io nodes=%d \n", io_nodes); +} + +int main(int argc, char **argv) +{ + int total_nodes; // The number of total nodes (comp + io) + int io_nodes; // The number of io nodes + int upper_bound; // The upper bound of the last dimension of the array + int lower_bound; // The starting number of the last dimension of the array + int arrayrank ; // The array rank. + int *arraysize; // The number of elements along each array dimention + int esize ; // element size of each array element + int mrank ; // Compute node mesh rank + int *mlayout; // Compute node mesh layout + int drank ; // IO node mesh rank + int cost_model_mode; // Whether the cost model is included. + int *dlayout; // IO node mesh layout + Distribution *mem_dist; // The memory array distribution along each dimention + // There are three possible distributions (BLOCK, + // NONE, CYCLIC). + int *blk_size; + Distribution *disk_dist; // The disk array distribution along each dimention + int my_rank, my_app_size, *world_ranks, leader; + + int upper_blk; // upper bound of the block size + int lower_blk; + Panda *bear; + + + MPI_Init(&argc, &argv); + +// For Parallel architecture (IBM SP2 like), +// Initialize the MPI environment. Only compute nodes will return from +// this call, the io nodes will not return from the call. 
All the io nodes + + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + MPI_Comm_size(MPI_COMM_WORLD, &my_app_size); + leader = 0; + world_ranks = (int *) malloc(sizeof(int)*my_app_size); + + parse_cl(argc, argv, total_nodes, io_nodes, upper_bound, lower_bound, + arrayrank, arraysize, esize, mrank, mlayout, drank, dlayout, mem_dist, + blk_size, disk_dist, cost_model_mode, upper_blk); + + int q = total_nodes/io_nodes; + + for (int i=0; i<io_nodes; i++) + world_ranks[i] = i*q; + for (int j=0; j<io_nodes; j++) + for (int k=1; k< q; k++) + world_ranks[i++] = j*q + k; + +/* + world_ranks[0] = 0; + world_ranks[1] = 3; + world_ranks[2] = 1; + world_ranks[3] = 2; + world_ranks[4] = 4; + world_ranks[5] = 5; + + printf("myrank= %d, io_nodes=%d, total_nodes=%d \n", + my_rank, io_nodes, total_nodes); +*/ + printf("world ranks \n"); + for (i=0; i<my_app_size; i++) + printf(" %d", world_ranks[i]); + printf("\n\n"); + + if (my_rank % q == 0) + { // io nodes + bear = new Panda(IO_NODE, 0, my_rank/q, io_nodes, world_ranks); + global_bear = bear; + } + else + { // compute nodes + bear = new Panda(COMPUTE_NODE, 1, + my_rank/q*(q-1)+(my_rank-1)%q, + my_app_size-io_nodes, world_ranks+io_nodes); + global_bear = bear; + + for (int size=lower_bound; size <= upper_bound; size*=2) + { + arraysize[arrayrank-1] = size; + gemein(bear,io_nodes, arrayrank, arraysize, esize, + mrank, mlayout, drank, dlayout, mem_dist, blk_size, + disk_dist, cost_model_mode); + } + + } + + free(arraysize); free(mlayout); free(dlayout); free(mem_dist); + free(blk_size); + free(disk_dist); + free(world_ranks); + delete bear; + + MPI_Finalize(); + return(0); +} diff --git a/src/Panda/io_main.C b/src/Panda/io_main.C new file mode 100644 index 0000000..69b0a63 --- /dev/null +++ b/src/Panda/io_main.C @@ -0,0 +1,83 @@ +#include "definitions.h" +#include "StopWatch.h" +#include "Panda.h" +#include "ArrayGroup.h" + +extern MPIFS* MPIFS_global_obj; +extern int BRANCHING_FACTOR; +extern int SUBCHUNK_SIZE; +Boolean shared_flag = NO; 
+ + +char my_getopt(char *str) +{ + char command[8][15]; + + strcpy(command[0], "-chunks"); + strcpy(command[1], "-xmax_messages"); + strcpy(command[2], "-tags"); + strcpy(command[3], "-branching_factor"); + strcpy(command[4], "-ymax_memory"); + strcpy(command[5], "-flag"); + strcpy(command[6], "-size_message"); + strcpy(command[7], "-Shared"); + + for (int i= 0; i< 8; i++) + if (!strncmp(str, command[i], 2)) return command[i][1]; + printf("undefined input %s, quit!\n",str); + exit(0); +} + +void parse_cl(int argc, char **argv) +{ + char opt; + + for(int i=1; i< argc; ){ + opt = my_getopt(argv[i++]); + switch(opt) { + case 'b' : + BRANCHING_FACTOR = atoi(argv[i++]); + break; + case 's': + SUBCHUNK_SIZE = atoi(argv[i++]); + break; + case 'S': + shared_flag = (Boolean) atoi(argv[i++]); + break; + } + } +} + +main(int argc, char **argv) +{ + int *world_ranks, my_rank, leader, app_size; + MPI_Init(&argc, &argv); + Panda *bear; + char cmd[100]; + + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + sprintf(cmd , "rm -rf %s", FILEPREFIX); + //if (my_rank == 0) + system(cmd); + sprintf(cmd , "mkdir %s", FILEPREFIX); + //if (my_rank == 0) + system(cmd); + MPI_Comm_size(MPI_COMM_WORLD, &app_size); + world_ranks = (int *) malloc(sizeof(int)*app_size); + leader = 0; + + for(int i=0;i< app_size; i++) + world_ranks[i] = leader+i; + parse_cl(argc, argv); + if (shared_flag){ + bear = new Panda(IO_NODE, 0, my_rank, app_size, + world_ranks, YES); + } + else { + bear = new Panda(IO_NODE, 0, my_rank, app_size, world_ranks); + } + delete bear; +// sprintf(cmd , "rm -rf %s", FILEPREFIX); if (my_rank == 0) system(cmd); + MPI_Finalize(); +} + diff --git a/src/Panda/make.code.defn b/src/Panda/make.code.defn new file mode 100644 index 0000000..dd9edc3 --- /dev/null +++ b/src/Panda/make.code.defn @@ -0,0 +1,77 @@ +SRCS = App_Info.C Array.C ArrayDistribution.C ArrayLayout.C Attribute.C Chunk.C Collective_IO.C List.C MPIFS.C Panda.C Simple_IO.C Template.C VirtFS.C c_interface.C + +SUBDIRS = + +# The 
9000 names of the cygwin tools and T3E... +TMPUN := $(shell uname) +ifeq ($(TMPUN), CYGWIN32_95) +UNAME = CYGWIN +else +ifeq ($(TMPUN), CYGWIN32_NT) +UNAME = CYGWIN +else +ifeq ($(TMPUN), CYGWIN_NT-4.0) +UNAME = CYGWIN +else +UNAME := $(shell uname | perl -pe 's/(sn\d\d\d\d|jsimpson)/UNICOS\/mk/') +endif +endif +endif + +# 64 Bit Irix +ifeq ($(UNAME), IRIX64) + +CXXFLAGS += -DANSI -DFILEPREFIX -ptused + +endif + +# 32 Bit Irix +ifeq ($(UNAME), IRIX) + +CXXFLAGS += -DANSI -ptused + +endif + +# HP +ifeq ($(UNAME), HP-UX) + +CXXFLAGS += -DANSI -DHP + +endif + +# Alpha +ifeq ($(UNAME), OSF1) + +CXXFLAGS += -DANSI + +endif + +# Linux +ifeq ($(UNAME), Linux) + +CXXFLAGS += -DANSI + +endif + +# Macintosh /PowerMach-MachTen +ifeq ($(UNAME), machten) + +CXXFLAGS += -DANSI + +endif + +# Cygwin / Win32 +ifeq ($(UNAME), CYGWIN) + +CFLAGS += -DANSI -DWIN32 +CXXFLAGS += -DANSI -DWIN32 + +endif + +# T3E +ifeq ($(UNAME), UNICOS/mk) + +CXXFLAGS += -DANSI -DT3E -hinstantiate=used + +endif + diff --git a/src/Panda/makefile.hpux.mpich b/src/Panda/makefile.hpux.mpich new file mode 100644 index 0000000..6ad74c5 --- /dev/null +++ b/src/Panda/makefile.hpux.mpich @@ -0,0 +1,19 @@ +# makefile part for hpux (yong 8/3/95) +include fileprefix +MPICH_HOME = /extra/ying/mpich +MPIRUN_HOME = /extra/ying/mpirun +INCLUDE_DIR = -I$(MPICH_HOME)/include -I$(MPIRUN_HOME)/include +WGEN_DIR = /extra/ying/mpich/profiling/wrappergen +LIBS = -L$(MPIRUN_HOME)/lib -lmpirun -L$(MPICH_HOME)/lib/$(ARCH)/$(COMM) -lmpi -lm -lV3 +#LIBS = -L$(MPIRUN_HOME)/lib -lmpirun -L$(MPICH_HOME)/lib/$(ARCH)/$(COMM) -lmpi -lpmpi -lm -lV3 +#MPILIB = $(MPICH_HOME)/lib/$(ARCH)/$(COMM)/libmpi.a +DEVICE = ch_p4 +COMM = ch_p4 +ARCH = hpux +AR = /usr/gnu/bin/ar # for aix, but also pretty standard +CC = gcc +OPTFLAGS = -g -Wall +CFLAGS = -DMPID_NO_FORTRAN -DHAS_XDR=1 \ + -DHAVE_STDLIB_H=1 \ + -DHAVE_SYSTEM=1 $(OPTFLAGS) $(INCLUDE_DIR) -DMPI_$(ARCH) \ + -DTARGETHPUX -DFILEPREFIX=$(FILEPREFIXVAL) diff --git a/src/Panda/makefile.ibm.mpif 
b/src/Panda/makefile.ibm.mpif new file mode 100644 index 0000000..a51b052 --- /dev/null +++ b/src/Panda/makefile.ibm.mpif @@ -0,0 +1,11 @@ +# makefile part for aix with our MPIFS filesystem on MPIF (jozwiak 030795) +include fileprefix +INCLUDE_DIR = -I/usr/local/include/ibm-mpi +LIBS = -lm -L/usr/local/lib/ibm-mpi -lmpirun +AR = /bin/ar # for aix +CC = mpCC +#CC = ./mpifxlC +OPTFLAGS = -g -DCOST_MODEL +CFLAGS = $(OPTFLAGS) $(INCLUDE_DIR) \ + -DTARGETAIX -DFILEPREFIX=$(FILEPREFIXVAL) \ + -DWRAPPERTEST -DNAS_MPIF diff --git a/src/Panda/makefile.proto b/src/Panda/makefile.proto new file mode 100644 index 0000000..17805b3 --- /dev/null +++ b/src/Panda/makefile.proto @@ -0,0 +1,96 @@ +# makefile on 3-7-95 for C++ version of panda + +# REMOVE # for the intended build (NOTE: # is a comment, unlike for C) +# include makefile.ibm.mpif +# include makefile.ibm.mpich # this one is flakey, use mpif +# include makefile.sun.mpich +# include makefile.unix.posix + +ARCHIVE = libeegads.a +OFILES = Array.o Chunk.o Simple_IO.o Panda.o \ + ArrayLayout.o List.o Collective_IO.o \ + MPIFS.o Attribute.o ArrayDistribution.o \ + Template.o VirtFS.o App_Info.o c_interface.o +CFILES = + +all: $(ARCHIVE) + +$(ARCHIVE): $(OFILES) + $(AR) crv $(ARCHIVE) $(OFILES) + +Array.o: Array.C Array.h Template.h List.h MPIFS.h ArrayLayout.h definitions.h + $(CC) $(CFLAGS) -c Array.C +ArrayGroup.o: ArrayGroup.C ArrayGroup.h ArrayGroup.h MPIFS.h definitions.h + $(CC) $(CFLAGS) -c ArrayGroup.C +List.o: List.C List.h definitions.h + $(CC) $(CFLAGS) -c List.C +ArrayLayout.o: ArrayLayout.C ArrayLayout.h Template.h definitions.h + $(CC) $(CFLAGS) -c ArrayLayout.C +Template.o: Template.C Template.h definitions.h + $(CC) $(CFLAGS) -c Template.C +VirtFS.o: VirtFS.C VirtFS.h + $(CC) -c $(CFLAGS) VirtFS.C +MPIFS.o: MPIFS.C MPIFS.h VirtFS.h Array.h Collective_IO.h Simple_IO.h definitions.h App_Info.h message.h + $(CC) -c $(CFLAGS) MPIFS.C +Panda.o: Panda.C Panda.h VirtFS.h MPIFS.h definitions.h + $(CC) -c $(CFLAGS) 
Panda.C +Chunk.o: Chunk.C Chunk.h ArrayLayout.h Array.h definitions.h + $(CC) -c $(CFLAGS) Chunk.C +Collective_IO.o: Collective_IO.C Collective_IO.h definitions.h + $(CC) -c $(CFLAGS) Collective_IO.C +Simple_IO.o: Simple_IO.C Simple_IO.h Collective_IO.h Array.h MPIFS.h definitions.h message.h + $(CC) -c $(CFLAGS) Simple_IO.C +CSDIO.o: CSDIO.C CSDIO.h Simple_IO.h Collective_IO.h Array.h MPIFS.h definitions.h message.h + $(CC) -c $(CFLAGS) CSDIO.C +Shared_IO.o: Shared_IO.C Shared_IO.h Simple_IO.h Collective_IO.h Array.h MPIFS.h definitions.h message.h + $(CC) -c $(CFLAGS) Shared_IO.C +CSDIO_Shared.o: CSDIO_Shared.C CSDIO_Shared.h CSDIO.h Simple_IO.h Collective_IO.h ArrayGroup.h Array.h MPIFS.h definitions.h message.h + $(CC) -c $(CFLAGS) CSDIO_Shared.C +App_Info.o: App_Info.C App_Info.h definitions.h + $(CC) -c $(CFLAGS) App_Info.C +c_interface.o: c_interface.C c_interface.h + $(CC) -c $(CFLAGS) c_interface.C +Attribute.o: Attribute.C Attribute.h + $(CC) -c $(CFLAGS) Attribute.C +ArrayDistribution.o: ArrayDistribution.C ArrayDistribution.h + $(CC) -c $(CFLAGS) ArrayDistribution.C + + +## Hey, Kent, how should we verify a build is indeed correct? +## it seems that there is sort of a chicken and egg problem +## here: we need a manually verified set of correct runs +## against which to test later builds and test runs... +## i set up the little bit below so that one can do a +## `make test' to verify a corrrect build ... + +oneexe: oneexe.C $(ARCHIVE) + $(CC) $(CFLAGS) oneexe.C -o oneexe -L. -leegads $(LIBS) + +io_main: io_main.C $(ARCHIVE) + $(CC) $(CFLAGS) io_main.C -o io_main -L. -leegads $(LIBS) + +compute_test: compute_test.C $(ARCHIVE) + $(CC) $(CFLAGS) compute_test.C -o compute_test -L. -leegads $(LIBS) + +part_test: part_test.C $(ARCHIVE) + $(CC) $(CFLAGS) part_test.C -o part_test -L. -leegads $(LIBS) + +shared_test: shared_test.C $(ARCHIVE) + $(CC) $(CFLAGS) shared_test.C -o shared_test -L. 
-leegads $(LIBS) + +cleantests: ; + - /bin/rm -f core + - /bin/rm -f $(TESTDIR) io_main compute_test part_test shared_test oneexe + sync + +clean: cleantests + - /bin/rm -f $(OFILES) $(ARCHIVE) +# - /bin/rm -f *~ PI* os-detected a.out mpi_test core *.o +# - /bin/rm -f mputil.mp*.c makefile fileprefix mpirun.* + - /bin/rm -f makefile fileprefix + sync + +configure: ; @echo "already configured, or this makefile wouldn't be here" + @echo "to reconfigure, make clean, then sh configure" + + diff --git a/src/Panda/makefile.sgi.mpich b/src/Panda/makefile.sgi.mpich new file mode 100644 index 0000000..f9071df --- /dev/null +++ b/src/Panda/makefile.sgi.mpich @@ -0,0 +1,10 @@ +# makefile part for aix with our MPIFS filesystem on MPIF (jozwiak 030795) +include fileprefix +Cactus_HOME = ../../.. +INCLUDE_DIR = -I/usr/include -I$(Cactus_HOME)/lib/IEEEIO +LIBS = -lmpi -L$(Cactus_HOME)/irix6/obj -lieeeio +AR = /usr/bin/ar # for aix +CC = CC +OPTFLAGS = -g +CFLAGS = $(OPTFLAGS) $(INCLUDE_DIR) \ + -DFILEPREFIX=$(FILEPREFIXVAL) diff --git a/src/Panda/makefile.sun.mpich b/src/Panda/makefile.sun.mpich new file mode 100644 index 0000000..4d00846 --- /dev/null +++ b/src/Panda/makefile.sun.mpich @@ -0,0 +1,18 @@ +# makefile part for bunny with our MPIFS filesystem on MPICH (jozwiak 030795) +include fileprefix +MPIR_HOME = /home2/panda/MPI/mpich +INCLUDE_DIR = -I$(MPIR_HOME)/include +LIBS = -L/home2/panda/MPI/mpich/lib/sun4/ch_p4 -lmpirun -lmpi -lm +MPILIB = $(MPIR_HOME)/lib/$(ARCH)/$(COMM)/libmpi.a +DEVICE = ch_p4 +COMM = ch_p4 +ARCH = sun4 +AR = /usr/5bin/ar # for sunos (bsd) +CC = gcc +OPTFLAGS = -g -Wall +CFLAGS = -DMPID_NO_FORTRAN -DHAS_XDR=1 \ + -DHAVE_STDLIB_H=1 -DNAS_MPIF\ + -DHAVE_SYSTEM=1 $(OPTFLAGS) $(INCLUDE_DIR) -DMPI_$(ARCH) \ + -DTARGETSUNOS -DFILEPREFIX=$(FILEPREFIXVAL) \ + -DWRAPPERTEST -DMPICH +# -DVERIFYBF -DDEBUG diff --git a/src/Panda/message.h b/src/Panda/message.h new file mode 100644 index 0000000..f76998f --- /dev/null +++ b/src/Panda/message.h @@ -0,0 +1,81 @@ 
+#ifndef message_dot_h +#define message_dot_h + + +inline void send_message(void *buf, int count, MPI_Datatype data_type, + int dest, int tag, MPI_Comm comm) +{ + MPI_Send(buf,count,data_type,dest,tag,comm); +#ifdef DEBUG + printf("Sending message to %d of size %d with tag %d\n", + dest, count, tag); +#endif +} + +inline void nb_send_message(void *buf, int count, MPI_Datatype data_type, + int dest, int tag, MPI_Comm comm, MPI_Request *request) +{ + MPI_Isend(buf,count,data_type,dest,tag,comm, request); +#ifdef DEBUG + printf("Sending nonblocking message to %d of size %d with tag %d\n", + dest, count, tag); +#endif +} + + +inline void receive_message(void *buf, int count, MPI_Datatype datatype, + int src, int tag, MPI_Comm comm, MPI_Status *status) +{ + MPI_Recv(buf,count,datatype, src,tag,comm,status); +#ifdef DEBUG + printf("Received message from %d of size %d with tag %d\n", + src, count, tag); +#endif +} + + +inline void nb_receive_message(void *buf, int count, MPI_Datatype datatype, + int src, int tag, MPI_Comm comm, MPI_Request *request) +{ + MPI_Irecv(buf,count,datatype, src,tag,comm,request); +#ifdef DEBUG + printf("Post a non-blocking receive for %d of size %d with tag %d\n", + src, count, tag); +#endif +} + + +inline void mpi_test(MPI_Request *request, int *flag, MPI_Status *status) +{ + MPI_Test(request, flag, status); +} + + +inline void mpi_get_count(MPI_Status *status, MPI_Datatype datatype, int *len) +{ + MPI_Get_count(status, datatype, len); +} + + +inline void any_new_message(int *msg_code, int *msg_src, + int *msg_tag,MPI_Status *msg_status) +{ + int flag; + + MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &flag, msg_status); + if (!flag){ + *msg_code = NO_MESSAGE; + *msg_src = -1; + *msg_tag = -1; + return; + } + else{ + /* There some message waiting for us */ + *msg_tag = msg_status->MPI_TAG; + *msg_src = msg_status->MPI_SOURCE; + *msg_code = msg_status->MPI_TAG % 10; + return; + } +} + +#endif diff --git a/src/Panda/oneexe.C 
b/src/Panda/oneexe.C new file mode 100644 index 0000000..f9b6b07 --- /dev/null +++ b/src/Panda/oneexe.C @@ -0,0 +1,91 @@ +#include <stdio.h> +#include <stdlib.h> +#include "mpi.h" +#include "IO.h" +#include "c_interface.h" + +extern "C" { int Panda_Create(int, char **, int, int); } +extern "C" { void Panda_Finalize(); } +extern "C" { void Panda_WriteAttribute(char *, char *, int, int, void *); } +extern "C" { void *Panda_ReadAttribute(char *, char *, int *, int *); } +extern "C" { void PandaTimestep(struct ArrayInfo *); } +extern "C" { char *PandaReadTimestep(struct ArrayInfo *); } + +int main(int argc, char **argv) +{ + int my_rank, i, j, k; + ArrayInfo ainfo; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); +// Panda_Create(argc, argv, 2, 1); + if (Panda_Create(argc, argv, 2, 0)) { MPI_Finalize(); return 1; } + + // Timestep-write +/* int size[3] = {16, 4, 4}; + int mem_layout[3] = {2, 1, 1}; + Distribution mem_dist[3] = {BLOCK, BLOCK, BLOCK}; + int disk_layout[1] = {2}; + Distribution disk_dist[3] = {BLOCK, NONE, NONE}; + int *data = (int *)malloc(sizeof(int) * 128); + ainfo.name_ = "./panda.out"; + ainfo.rank_ = 3; + ainfo.size_ = size; + ainfo.esize_ = INT32; + ainfo.mem_rank_ = 3; + ainfo.mem_layout_ = mem_layout; + ainfo.mem_dist_ = mem_dist; + ainfo.disk_rank_ = 1; + ainfo.disk_layout_ = disk_layout; + ainfo.disk_dist_ = disk_dist; + ainfo.data_ = (char*)data; + + for (i=0; i<8; i++) + for (j=0; j<4; j++) + for (k=0; k<4; k++) data[i*16+j*4+k] = i*16+j*4+k + my_rank; + + ainfo.stencil_width_ = 0; + PandaTimestep(&ainfo); + Panda_WriteAttribute("./panda.out", "global_size", INT32, 3, size); + + printf("%d - ", my_rank); + for (i=0; i<8; i++) + for (j=0; j<4; j++) + for (k=0; k<4; k++) printf("%d ", data[i*16+j*4+k]); + printf("\n"); fflush(stdout); + free(data); */ + + // ReadTimeste-write + int mem_layout[3] = {2, 1, 1};; + Distribution mem_dist[3] = {BLOCK, BLOCK, BLOCK}; + ainfo.name_ = "./panda.out"; + ainfo.rank_ = 3; + ainfo.size_ = 
NULL; + ainfo.esize_ = 0; + ainfo.mem_rank_ = 3; + ainfo.mem_layout_ = mem_layout; + ainfo.mem_dist_ = mem_dist; + ainfo.disk_rank_ = 0; + ainfo.disk_layout_ = NULL; + ainfo.disk_dist_ = NULL; + ainfo.data_ = NULL; + + int *data = (int *)PandaReadTimestep(&ainfo); + + printf("%d - ", my_rank); + for (i=0; i<8; i++) + for (j=0; j<4; j++) + for (k=0; k<4; k++) printf("%d ", data[i*16+j*4+k]); + printf("\n"); fflush(stdout); + free(data); + + int type, count; + int *data1 = (int *)Panda_ReadAttribute("./panda.out", "global_size", + &type, &count); + printf("%d: data type %d, count %d, contents: ", my_rank, type, count); + for (i=0; i<count; i++) printf("%d ", data1[i]); + printf("\n"); + free(data1); + + Panda_Finalize(); + MPI_Finalize(); +} diff --git a/src/Panda/os-detected b/src/Panda/os-detected new file mode 100644 index 0000000..4f378d7 --- /dev/null +++ b/src/Panda/os-detected @@ -0,0 +1 @@ +irix diff --git a/src/Panda/part_test.C b/src/Panda/part_test.C new file mode 100644 index 0000000..03a7c2c --- /dev/null +++ b/src/Panda/part_test.C @@ -0,0 +1,385 @@ +/***************************************************************** + * This is a sample program that shows how the panda library * + * is going to be used by the application programs. * + * The example command line format is in test7.script. * + * This example shows the interface with only disk layout * + * info but no stride or subchunking schema. The value for * + * those schemas use the default ones. * + * The current test varies the size of arrays. However, the * + * wrapper function allows the number of the nodes to be * + * changed as well. * + * The first iteration loads all the code in memory. * + * The second run does the simulated disk simulation. * + * From the third run on, the values are the real writes. 
* + *****************************************************************/ + +#include <stdio.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <ctype.h> +#include "definitions.h" +#include "StopWatch.h" +#include "ArrayGroup.h" +#include "ArrayLayout.h" +#include "Array.h" +#include "Panda.h" + +int Num_of_Arrays = 1; +int Num_Simulate_Read = 0; +int Num_Read = 0; +int Num_Simulate_Write = 2; +int Num_Write = 2 ; +int interleave = 0; +Panda *global_bear; +int world_rank; + +extern int BRANCHING_FACTOR; +extern int SUBCHUNK_SIZE; +int STRATEGY = 1; + +void test_timestep(ArrayGroup *t1, int arraysize, Array **arrays) +{ + StopWatch timer; + int i; + int flag=0; + char time_message[100]; + +#ifdef VERIFYBF + for (int j=0; j<Num_of_Arrays; j++) arrays[j]->set_byte_pattern(); + t1->set_verify(); +#endif + + + global_bear->cleanfiles(); + global_bear->createfiles(); + + for (i=0; i<Num_Simulate_Write+Num_Write; ++i) { + if (i < Num_Simulate_Write){ + t1->set_simulate_mode(); + flag=0; + } + else { + t1->reset_simulate_mode(); + flag=1; + global_bear->cleanfiles(); + global_bear->createfiles(); + } + + + global_bear->app_barrier(); + t1->set_io_strategy(STRATEGY); + timer.start(); + t1->timestep(); + timer.stop(":"); + sprintf(time_message,"%s Write: SIZE: %d, Time %i %s", + (flag==0? 
"Simulated":"Real"), + arraysize, i, timer.get_description()); + printf("%s", time_message); + + if (Num_Read + Num_Simulate_Read == 0 || i < Num_Simulate_Write + Num_Write-1 ) { + global_bear->cleanfiles(); + global_bear->createfiles(); + + } + } +} + +void test_readtimestep(ArrayGroup *r1, int arraysize, Array **arrays) +{ + StopWatch timer; + int i; + int flag; + char time_message[100]; +#ifdef VERIFYBF + for (int j=0; j<Num_of_Arrays; j++) arrays[j]->reset_byte_pattern(); +#endif + + if (Num_Write + Num_Simulate_Write == 0) { + global_bear->cleanfiles(); + global_bear->createfiles(); + } + + + + for (i=0; i<Num_Simulate_Read+Num_Read; ++i) { + if (i < Num_Simulate_Read) { r1->set_simulate_mode(); flag=0; } + else {r1->reset_simulate_mode(); + flag=1; + global_bear->flushfiles(); + } + + + global_bear->app_barrier(); + r1->set_io_strategy(STRATEGY); + timer.start(); + r1->restart(); + timer.stop(":"); + + sprintf(time_message,"%s Read: SIZE: %d, Time %i %s ", + (flag==0? "Simulated":"Real"), + arraysize, i, timer.get_description()); + printf("%s", time_message); + + } +#ifdef VERIFYBF + for(i=0;i<Num_of_Arrays;i++) + if (arrays[i]->verify_byte_pattern()) + printf("%d:Byte pattern verified for array %d\n", world_rank, i); + else + printf("%d:Byte pattern incorrect for array %d\n",world_rank,i); +#endif + global_bear->cleanfiles(); +} + + +int gemein(Panda *bear, int io_nodes, int arrayrank, int *arraysize, int esize, + int mrank, int *mlayout, int drank, int *dlayout, + Distribution *mem_dist, Distribution *disk_dist, int cost_model) +{ + ArrayLayout *mem1; // Memory array layout + ArrayLayout *disk1; // Disk array layout + int i; + Array **arrays; + arrays = (Array **)malloc(sizeof(Array*)*Num_of_Arrays); + +// Set up memory and disk layouts + mem1 = new ArrayLayout (mrank,mlayout); + disk1 = new ArrayLayout(drank,dlayout); + +// Create an Array for computation. 
+ char *name; + name = (char *)malloc(sizeof(char)*(strlen("z1Array")+5)); + char temp[5]; + for (i=0; i< Num_of_Arrays; i++) { + strcpy(name,"z1Array"); + sprintf(temp, "%d", i); + strcat(name, temp); + arrays[i] = new Array(name,arrayrank,arraysize,esize, + mem1,mem_dist,disk1, disk_dist); + } + free(name); + + if (Num_Simulate_Write + Num_Write > 0) { + ArrayGroup *t1 = new ArrayGroup("z4timestep"); + for (i= 0; i<Num_of_Arrays; i++) t1->insert(arrays[i]); + test_timestep(t1, arraysize[arrayrank-1], arrays); + delete t1; + if (Num_Simulate_Read + Num_Read > 0) { + ArrayGroup *r1 = new ArrayGroup("z4timestep"); + for (i= 0; i<Num_of_Arrays; i++) r1->insert(arrays[i]); + test_readtimestep(r1, arraysize[arrayrank-1], arrays); + delete r1; + } + } else { + + ArrayGroup *r1 = new ArrayGroup("z4timestep"); + for (i= 0; i<Num_of_Arrays; i++) r1->insert(arrays[i]); + test_readtimestep(r1, arraysize[arrayrank-1], arrays); + delete r1; + } + + // delete all objects created + + for (i=0; i<Num_of_Arrays; i++) delete arrays[i]; + free(arrays); + delete disk1; + delete mem1; + return(0); +} + +char my_getopt(char *str) +{ + char command[25][15]; + + strcpy(command[0], "-Total_nodes"); + strcpy(command[1], "-Io_nodes"); + strcpy(command[2], "-upper"); + strcpy(command[3], "-Arraysize"); + strcpy(command[4], "-Esize"); + strcpy(command[5], "-Mlayout"); + strcpy(command[6], "-Dlayout"); + strcpy(command[7], "-mem_dist"); + strcpy(command[8], "-disk_dist"); + strcpy(command[9], "-num_arrays"); + strcpy(command[10], "-read_simulate"); + strcpy(command[11], "-Read"); + strcpy(command[12], "-write_simulate"); + strcpy(command[13], "-Write"); + strcpy(command[14], "-interleave"); + strcpy(command[15], "-Cost_model"); + strcpy(command[16], "-chunks"); + strcpy(command[17], "-xmax_messages"); + strcpy(command[18], "-tags"); + strcpy(command[19], "-branching_factor"); + strcpy(command[20], "-ymax_memory"); + strcpy(command[21], "-flag"); + strcpy(command[22], "-size_message"); + 
strcpy(command[23], "-Xfactor"); + strcpy(command[24], "-Optimize"); + + for (int i= 0; i< 25; i++) + if (!strncmp(str, command[i], 2)) return command[i][1]; + printf("undefined input %s, quit!\n",str); + return NULL; +} + +void parse_cl(int argc, char **argv, int &total_nodes, int &io_nodes, + int &upper_bound, int &lower_bound, int &arrayrank, int*& arraysize, + int &esize, int &mrank, int*& mlayout, int& drank, int*& dlayout, + Distribution*& mem_dist, Distribution*& disk_dist, int &cost_model_mode) +{ + char opt; + int k; + + for (int i=1; i<argc; ) { + opt = my_getopt(argv[i++]); + switch(opt) + { + case 'X': + STRATEGY = atoi(argv[i++]); + break; + case 'T': + total_nodes = atoi(argv[i++]); + break; + case 'I': + io_nodes = atoi(argv[i++]); + break; + case 'u': + upper_bound = atoi(argv[i++]); + break; + case 'A': + arrayrank = atoi(argv[i++]); + arraysize = (int *) malloc(sizeof(int)* arrayrank); + mem_dist = (Distribution *)malloc(sizeof(Distribution)*arrayrank); + disk_dist = (Distribution *)malloc(sizeof(Distribution)*arrayrank); + for (k = 0; k < arrayrank; k++) arraysize[k] = atoi(argv[i++]); + lower_bound = arraysize[k-1]; + break; + case 'E': + esize = atoi(argv[i++]); + break; + case 'M': + mrank = atoi(argv[i++]); + mlayout = (int *) malloc(sizeof(int)* mrank); + for (k = 0; k < mrank; k++) mlayout[k] = atoi(argv[i++]); + break; + case 'D': + drank = atoi(argv[i++]); + dlayout = (int *) malloc(sizeof(int)* drank); + for (k = 0; k < drank; k++) dlayout[k] = atoi(argv[i++]); + break; + case 'm': + for (k = 0; k < arrayrank; k++) mem_dist[k] = (Distribution)atoi(argv[i++]); + break; + case 'd': + for (k = 0; k < arrayrank; k++) disk_dist[k] = (Distribution)atoi(argv[i++]); + break; + case 'n': + Num_of_Arrays = atoi(argv[i++]); + break; + case 'r': + Num_Simulate_Read = atoi(argv[i++]); + break; + case 'R': + Num_Read = atoi(argv[i++]); + break; + case 'w': + Num_Simulate_Write = atoi(argv[i++]); + break; + case 'W': + Num_Write = atoi(argv[i++]); + 
break; + case 'i': + interleave = atoi(argv[i++]); + break; + case 'C': + cost_model_mode = atoi(argv[i++]); + break; + case 'b' : + BRANCHING_FACTOR = atoi(argv[i++]); + break; + case 's': + SUBCHUNK_SIZE = atoi(argv[i++]); + break; + } + } +} + + +int main(int argc, char **argv) +{ + int total_nodes; // The number of total nodes (comp + io) + int io_nodes; // The number of io nodes + int upper_bound; // The upper bound of the last dimension of the array + int lower_bound; // The starting number of the last dimension of the array + int arrayrank ; // The array rank. + int *arraysize; // The number of elements along each array dimention + int esize ; // element size of each array element + int mrank ; // Compute node mesh rank + int *mlayout; // Compute node mesh layout + int drank ; // IO node mesh rank + int cost_model_mode; // Whether the cost model is included. + int *dlayout; // IO node mesh layout + Distribution *mem_dist; // The memory array distribution along each dimention + // There are three possible distributions (BLOCK, + // NONE, CYCLIC). + Distribution *disk_dist; // The disk array distribution along each dimention + int my_rank, my_app_size, *world_ranks, leader; + char sys_command[100]; + + MPI_Init(&argc, &argv); + +// For Parallel architecture (IBM SP2 like), +// Initialize the MPI environment. Only compute nodes will return from +// this call, the io nodes will not return from the call. 
All the io nodes + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + MPI_Comm_size(MPI_COMM_WORLD, &my_app_size); + leader = 0; + world_ranks = (int *) malloc(sizeof(int)*my_app_size); + for(int i=0;i< my_app_size; i++) + world_ranks[i] = leader+i; + + + + Panda *bear; + int my_io_rank = my_rank; + int *io_ranks; + + parse_cl(argc, argv, total_nodes, io_nodes, upper_bound, lower_bound, + arrayrank, arraysize, esize, mrank, mlayout, drank, dlayout, + mem_dist, disk_dist, cost_model_mode); + + io_ranks = world_ranks; + + + if (my_io_rank<io_nodes) + { + global_bear = new Panda(PART_TIME_IO, my_rank, my_app_size, world_ranks, + my_io_rank, io_nodes, io_ranks); + bear = global_bear; + } + else + { + global_bear = new Panda(PART_TIME_COMPUTE, my_rank, my_app_size, world_ranks, + -1, io_nodes, io_ranks); + bear = global_bear; + } + for (int size=lower_bound; size <= upper_bound; size*=2) { + arraysize[arrayrank-1] = size; + gemein(bear,io_nodes, arrayrank, arraysize, esize, + mrank, mlayout, drank, dlayout, mem_dist, + disk_dist, cost_model_mode); + } + + free(mlayout); + free(dlayout); + free(mem_dist); + free(disk_dist); + free(world_ranks); + delete bear; + + MPI_Finalize(); + return(0); +} diff --git a/src/Panda/shared_test.C b/src/Panda/shared_test.C new file mode 100644 index 0000000..00ebaa1 --- /dev/null +++ b/src/Panda/shared_test.C @@ -0,0 +1,353 @@ +/***************************************************************** + * This is a sample program that shows how the panda library * + * is going to be used by the application programs. * + * The example command line format is in test7.script. * + * This example shows the interface with only disk layout * + * info but no stride or subchunking schema. The value for * + * those schemas use the default ones. * + * The current test varies the size of arrays. However, the * + * wrapper function allows the number of the nodes to be * + * changed as well. 
* + * The first iteration loads all the code in memory. * + * The second run does the simulated disk simulation. * + * From the third run on, the values are the real writes. * + *****************************************************************/ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <ctype.h> +#include "definitions.h" +#include "StopWatch.h" +#include "ArrayGroup.h" +#include "ArrayLayout.h" +#include "Array.h" +#include "Panda.h" +#include "mpirun.h" + +int Num_of_Arrays = 1; +int Num_Simulate_Read = 0; +int Num_Read = 0; +int Num_Simulate_Write = 2; +int Num_Write = 2 ; +int interleave = 0; +Panda *global_bear; + +extern int BRANCHING_FACTOR; +extern int SUBCHUNK_SIZE; +int STRATEGY = 1; + +void test_timestep(ArrayGroup *t1, int arraysize, Array **arrays) +{ + StopWatch timer; + int i; + int flag=0; + char time_message[100]; + +#ifdef VERIFYBF + for (int j=0; j<Num_of_Arrays; j++) arrays[j]->set_byte_pattern(); + t1->set_verify(); +#endif + + + global_bear->cleanfiles(); + global_bear->createfiles(); + + + for (i=0; i<Num_Simulate_Write+Num_Write; ++i) { + if (i < Num_Simulate_Write){ + t1->set_simulate_mode(); + flag=0; + } + else { + t1->reset_simulate_mode(); + flag=1; + } + + + global_bear->global_barrier(); + t1->set_io_strategy(STRATEGY); + timer.start(); + t1->timestep(); + timer.stop(":"); + sprintf(time_message,"App_id %d: %s Write: SIZE: %d, Time %i %s", + MPIRUN_APP_ID, (flag==0? 
"Simulated":"Real"), + arraysize, i, timer.get_description()); + printf("%s", time_message); + + if (Num_Read + Num_Simulate_Read == 0 || i < Num_Simulate_Write + Num_Write-1 ) { + global_bear->cleanfiles(); + global_bear->createfiles(); + + } + } +} + +void test_readtimestep(ArrayGroup *r1, int arraysize, Array **arrays) +{ + StopWatch timer; + int i; + int flag; + char time_message[100]; +#ifdef VERIFYBF + for (int j=0; j<Num_of_Arrays; j++) arrays[j]->reset_byte_pattern(); +#endif + + if (Num_Write + Num_Simulate_Write == 0) { + global_bear->cleanfiles(); + global_bear->createfiles(); + } + + + + for (i=0; i<Num_Simulate_Read+Num_Read; ++i) { + if (i < Num_Simulate_Read) { r1->set_simulate_mode(); flag=0; } + else {r1->reset_simulate_mode(); + flag=1; + global_bear->flushfiles(); + } + + + global_bear->global_barrier(); + r1->set_io_strategy(STRATEGY); + timer.start(); + r1->read_timestep(); + timer.stop(":"); + + sprintf(time_message,"App_id %d: %s Read: SIZE: %d, Time %i %s ", + MPIRUN_APP_ID, (flag==0? "Simulated":"Real"), + arraysize, i, timer.get_description()); + printf("%s", time_message); + } +#ifdef VERIFYBF + for(i=0;i<Num_of_Arrays;i++) + if (arrays[i]->verify_byte_pattern()) + printf("Byte pattern verified for array %d\n", i); + else + printf("Byte pattern incorrect for array %d\n", i); +#endif + global_bear->cleanfiles(); +} + + +int gemein(Panda *bear, int io_nodes, int arrayrank, int *arraysize, int esize, + int mrank, int *mlayout, int drank, int *dlayout, + Distribution *mem_dist, Distribution *disk_dist, int cost_model) +{ + ArrayLayout *mem1; // Memory array layout + ArrayLayout *disk1; // Disk array layout + int i; + Array **arrays; + arrays = (Array **)malloc(sizeof(Array*)*Num_of_Arrays); + +// Set up memory and disk layouts + mem1 = new ArrayLayout (mrank,mlayout); + disk1 = new ArrayLayout(drank,dlayout); + +// Create an Array for computation. 
+ char *name; + name = (char *)malloc(sizeof(char)*(strlen("z1Array")+5)); + char temp[5]; + for (i=0; i< Num_of_Arrays; i++) { + strcpy(name,"z1Array"); + sprintf(temp, "%d", i); + strcat(name, temp); + arrays[i] = new Array(name,arrayrank,arraysize,esize, + mem1,mem_dist,disk1, disk_dist); + } + free(name); + + if (Num_Simulate_Write + Num_Write > 0) { + ArrayGroup *t1 = new ArrayGroup("z4timestep"); + for (i= 0; i<Num_of_Arrays; i++) t1->insert(arrays[i]); + test_timestep(t1, arraysize[arrayrank-1], arrays); + delete t1; + if (Num_Simulate_Read + Num_Read > 0) { + ArrayGroup *r1 = new ArrayGroup("z4timestep"); + for (i= 0; i<Num_of_Arrays; i++) r1->insert(arrays[i]); + test_readtimestep(r1, arraysize[arrayrank-1], arrays); + delete r1; + } + } else { + + ArrayGroup *r1 = new ArrayGroup("z4timestep"); + for (i= 0; i<Num_of_Arrays; i++) r1->insert(arrays[i]); + test_readtimestep(r1, arraysize[arrayrank-1], arrays); + delete r1; + } + + // delete all objects created + + for (i=0; i<Num_of_Arrays; i++) delete arrays[i]; + free(arrays); + delete disk1; + delete mem1; + return(0); +} + +char my_getopt(char *str) +{ + char command[18][15]; + + strcpy(command[0], "-Total_nodes"); + strcpy(command[1], "-Io_nodes"); + strcpy(command[2], "-upper"); + strcpy(command[3], "-Arraysize"); + strcpy(command[4], "-Esize"); + strcpy(command[5], "-Mlayout"); + strcpy(command[6], "-Dlayout"); + strcpy(command[7], "-mem_dist"); + strcpy(command[8], "-disk_dist"); + strcpy(command[9], "-num_arrays"); + strcpy(command[10], "-read_simulate"); + strcpy(command[11], "-Read"); + strcpy(command[12], "-write_simulate"); + strcpy(command[13], "-Write"); + strcpy(command[14], "-interleave"); + strcpy(command[15], "-Cost_model"); + strcpy(command[16], "-size_message"); + strcpy(command[17], "-Xfactor"); + + for (int i= 0; i< 18; i++) + if (!strncmp(str, command[i], 2)) return command[i][1]; + printf("undefined input %s, quit!\n",str); + exit(0); +} + +void parse_cl(int argc, char **argv, int 
&total_nodes, int &io_nodes, + int &upper_bound, int &lower_bound, int &arrayrank, int*& arraysize, + int &esize, int &mrank, int*& mlayout, int& drank, int*& dlayout, + Distribution*& mem_dist, Distribution*& disk_dist, int &cost_model_mode) +{ + char opt; + int k; + + for (int i=1; i<argc; ) { + opt = my_getopt(argv[i++]); + switch(opt) { + case 'X': + STRATEGY = atoi(argv[i++]); + break; + case 's': + SUBCHUNK_SIZE = atoi(argv[i++]); + break; + case 'T': + total_nodes = atoi(argv[i++]); + break; + case 'I': + io_nodes = atoi(argv[i++]); + break; + case 'u': + upper_bound = atoi(argv[i++]); + break; + case 'A': + arrayrank = atoi(argv[i++]); + arraysize = (int *) malloc(sizeof(int)* arrayrank); + mem_dist = (Distribution *)malloc(sizeof(Distribution)*arrayrank); + disk_dist = (Distribution *)malloc(sizeof(Distribution)*arrayrank); + for (k = 0; k < arrayrank; k++) arraysize[k] = atoi(argv[i++]); + lower_bound = arraysize[k-1]; + break; + case 'E': + esize = atoi(argv[i++]); + break; + case 'M': + mrank = atoi(argv[i++]); + mlayout = (int *) malloc(sizeof(int)* mrank); + for (k = 0; k < mrank; k++) mlayout[k] = atoi(argv[i++]); + break; + case 'D': + drank = atoi(argv[i++]); + dlayout = (int *) malloc(sizeof(int)* drank); + for (k = 0; k < drank; k++) dlayout[k] = atoi(argv[i++]); + break; + case 'm': + for (k = 0; k < arrayrank; k++) mem_dist[k] = (Distribution)atoi(argv[i++]); + break; + case 'd': + for (k = 0; k < arrayrank; k++) disk_dist[k] = (Distribution)atoi(argv[i++]); + break; + case 'n': + Num_of_Arrays = atoi(argv[i++]); + break; + case 'r': + Num_Simulate_Read = atoi(argv[i++]); + break; + case 'R': + Num_Read = atoi(argv[i++]); + break; + case 'w': + Num_Simulate_Write = atoi(argv[i++]); + break; + case 'W': + Num_Write = atoi(argv[i++]); + break; + case 'i': + interleave = atoi(argv[i++]); + break; + case 'C': + cost_model_mode = atoi(argv[i++]); + break; + } + } +} + +int main(int argc, char **argv) +{ + int total_nodes; // The number of total 
nodes (comp + io) + int io_nodes; // The number of io nodes + int upper_bound; // The upper bound of the last dimension of the array + int lower_bound; // The starting number of the last dimension of the array + int arrayrank ; // The array rank. + int *arraysize; // The number of elements along each array dimention + int esize ; // element size of each array element + int mrank ; // Compute node mesh rank + int *mlayout; // Compute node mesh layout + int drank ; // IO node mesh rank + int cost_model_mode; // Whether the cost model is included. + int *dlayout; // IO node mesh layout + Distribution *mem_dist; // The memory array distribution along each dimention + // There are three possible distributions (BLOCK, + // NONE, CYCLIC). + Distribution *disk_dist; // The disk array distribution along each dimention + int my_rank, my_app_size, *world_ranks, leader; + + + MPI_Init(&argc, &argv); + MPIRUN_Init(&argc, &argv); + +// For Parallel architecture (IBM SP2 like), +// Initialize the MPI environment. Only compute nodes will return from +// this call, the io nodes will not return from the call. 
All the io nodes + + MPI_Comm_rank(MPIRUN_APP_COMM, &my_rank); + MPI_Comm_size(MPIRUN_APP_COMM, &my_app_size); + leader = MPIRUN_APP_LEADERS[MPIRUN_APP_ID]; + world_ranks = (int *) malloc(sizeof(int)*my_app_size); + for(int i=0;i< my_app_size; i++) + world_ranks[i] = leader+i; + printf("MPIRUN_APP_ID = %d\n", MPIRUN_APP_ID); + Panda * bear = new Panda(COMPUTE_NODE, MPIRUN_APP_ID, my_rank, my_app_size, + world_ranks); + global_bear = bear; + + + + + parse_cl(argc, argv, total_nodes, io_nodes, upper_bound, lower_bound, arrayrank, + arraysize, esize, mrank, mlayout, drank, dlayout, mem_dist, disk_dist, cost_model_mode); + for (int size=lower_bound; size <= upper_bound; size*=2) { + arraysize[arrayrank-1] = size; + gemein(bear,io_nodes, arrayrank, arraysize, esize, + mrank, mlayout, drank, dlayout, mem_dist, disk_dist, cost_model_mode); + } + free(arraysize); + free(mlayout); + free(dlayout); + free(mem_dist); + free(disk_dist); + free(world_ranks); + delete bear; + MPI_Finalize(); + return(0); +} diff --git a/src/Startup.c b/src/Startup.c new file mode 100644 index 0000000..500c6bd --- /dev/null +++ b/src/Startup.c @@ -0,0 +1,77 @@ + /*@@ + @file Startup.c + @date 01 Oct 1999 + @author Jonghyun Lee + @desc Startup routines for IOPanda. 
+ @enddesc + @history + @endhistory + @@*/ + +#include <stdio.h> +#include <string.h> + +#include "cctk.h" +#include "cctk_Flesh.h" +#include "cctk_GHExtensions.h" +#include "cctk_parameters.h" +#include "CactusBase/IOUtil/src/ioGH.h" + +/* prototypes of functions to be registered */ +int IOPanda_Output3DGH (cGH *GH); +int IOPanda_TriggerOutput3D (cGH *GH, int); +int IOPanda_TimeFor3D (cGH *GH, int); +int IOPanda_Output3DVarAs (cGH *GH, const char *var, const char *alias); +void *IOPanda_SetupGH (tFleshConfig *config, int convergence_level, cGH *GH); +int IOPanda_InitGH (cGH *GH); +int IOPanda_RecoverGH (cGH *GH, const char *basename, int called_from); + + //void Panda_Create(int, int); +void Panda_Finalize(void); + + /*@@ + @routine IOPanda_Startup + @date Fri May 21 1999 + @author Thomas Radke + @desc + The startup registration routine for IOPanda. + Registers the GH extensions needed for IOPanda and + the registerable routines used for each method of IOPanda. + IOPanda does not overload any functions. 
+ @enddesc + @calls + @calledby + @history + + @endhistory + +@@*/ +void IOPanda_Startup (void) +{ + int IO_GHExtension; + int IOMethod; + + IO_GHExtension = CCTK_RegisterGHExtension ("IOPanda"); + CCTK_RegisterGHExtensionSetupGH (IO_GHExtension, IOPanda_SetupGH); + CCTK_RegisterGHExtensionInitGH (IO_GHExtension, IOPanda_InitGH); + + /* Register the 3D IOPandaIO routines as output methods */ + IOMethod = CCTK_RegisterIOMethod ("IOPandaIO_3D"); + CCTK_RegisterIOMethodOutputGH (IOMethod, IOPanda_Output3DGH); + CCTK_RegisterIOMethodOutputVarAs (IOMethod, IOPanda_Output3DVarAs); + CCTK_RegisterIOMethodTimeToOutput (IOMethod, IOPanda_TimeFor3D); + CCTK_RegisterIOMethodTriggerOutput (IOMethod, IOPanda_TriggerOutput3D); + +#if 0 + /* Register the IOPanda recovery routine to thorn IOUtil */ + if (IOUtil_RegisterRecover ("IOPanda recovery", IOPanda_RecoverGH) < 0) + CCTK_WARN (1, "Failed to register IOPanda recovery routine"); + Panda_Create(1, 1); +#endif + +} + +void IOPanda_Finalize(void) +{ + Panda_Finalize(); +} diff --git a/src/ioPandaGH.h b/src/ioPandaGH.h new file mode 100644 index 0000000..f4da8d7 --- /dev/null +++ b/src/ioPandaGH.h @@ -0,0 +1,32 @@ + /*@@ + @header ioPandaGH.h + @date 01 Oct 1999 + @author Jonghyun Lee + @desc The extensions to the GH structure from IOPanda. 
+ @history + @endhistory + @@*/ + +#include <string.h> + +#include "StoreNamedData.h" + + +typedef struct IOPandaGH { + + /* The number of times output */ + int *IO_3Dnum; + + /* How often to output */ + int IO_3Devery; + + /* Directory in which to output */ + char *outpfx_3D; + + /* The last iteration output */ + int *IO_3Dlast; + + /* filename database for opened files */ + pNamedData *fileList_3D; + +} pandaGH; diff --git a/src/make.code.defn b/src/make.code.defn new file mode 100644 index 0000000..174c78b --- /dev/null +++ b/src/make.code.defn @@ -0,0 +1,3 @@ +SRCS = Startup.c GHExtension.c Output3D.c DumpVar.c + +SUBDIRS = Panda diff --git a/src/make.configuration.defn b/src/make.configuration.defn new file mode 100644 index 0000000..64d3f73 --- /dev/null +++ b/src/make.configuration.defn @@ -0,0 +1,21 @@ +# make.configuration.defn for IOPanda + +# make sure that IOPanda was configured in with MPI and IEEEIO + +ifeq ($(strip $(HAVE_IEEEIO)), ) +$(NAME): MissingIEEEIO +.pseudo: MissingIEEEIO +MissingIEEEIO: + @echo "IOPanda: requires IEEEIO" + @echo "IOPanda: Please configure Cactus with thorn external/IEEEIO or remove IOPanda from Thornlist !" + exit 2 +endif + +ifeq ($(strip $(MPI_LIBS)), ) +$(NAME): MissingMPI +.pseudo: MissingMPI +MissingMPI: + @echo "IOPanda: requires MPI" + @echo "IOPanda: Please configure Cactus with MPI or remove IOPanda from Thornlist !" + exit 2 +endif |