#include <stdio.h>   /* restored: the bracket include was lost in this copy */
#include <stdlib.h>
#include <string.h>
#include "vatoi.hh"
#include "IEEEIO.hh"
#include "MPIutils.hh"
#include "MPIO.hh"
#include "Timer.hh"

int StupidProcLayoutSeive(int np,int decomp[3]){
  // Brute-Force method:
  // stupidly attempts all decomposition combinations,
  // filters out unusable combinations, and
  // tries to select the minimum-surface-area processor arrangement.
  // e.g. np=12 yields decomp={2,2,3} (metric 2+2+3=7).
  int minarea = np*np*np; // impossibly high value
  for(int i=1;i<=np;i++){
    for(int j=1;i*j<=np;j++){
      for(int k=1;i*j*k<=np;k++){
        if(i*j*k!=np) continue; // not an exact factorization of np
        int area = i+j+k; // area metric (optimizing this weakly minimizes surface area)
        if(area<minarea){ // remember the best layout found so far
          minarea=area;
          decomp[0]=i; decomp[1]=j; decomp[2]=k;
        }
      }
    }
  }
  return minarea;
}

struct CommandLine {
  // NOTE: the original class declaration and argument parsing were lost in
  // this copy; the members and constructor below are a minimal reconstruction
  // inferred from usage(), printStatus(), and the references in main().
  enum DimType {Local,Global};
  char *programname;
  int verbose;     // -verbose
  int dims[3];     // -dims int,int,int (default 20,20,20)
  DimType dimtype; // -dimtype global|local (default Local)
  int bufsize;     // buffer-cache size in bytes (flag name assumed: -bufsize)
  int ntrials;     // number of timed writes (flag name assumed: -ntrials)
  int syncsends;   // referenced only by commented-out code in main()
  CommandLine(int argc,char *argv[],MPIcomm *comm):
    verbose(0),dimtype(Local),bufsize(0),ntrials(1),syncsends(0){
    programname=argv[0];
    dims[0]=dims[1]=dims[2]=20;
    int badargs=0;
    for(int i=1;i<argc;i++){
      if(!strcmp(argv[i],"-verbose")) verbose=1;
      else if(!strcmp(argv[i],"-dims") && i+1<argc)
        sscanf(argv[++i],"%d,%d,%d",dims,dims+1,dims+2);
      else if(!strcmp(argv[i],"-dimtype") && i+1<argc)
        dimtype=(*argv[++i]=='g')?Global:Local;
      else if(!strcmp(argv[i],"-ntrials") && i+1<argc) ntrials=atoi(argv[++i]);
      else if(!strcmp(argv[i],"-bufsize") && i+1<argc) bufsize=atoi(argv[++i]);
      else badargs=1;
    }
    if(badargs && !comm->rank()){
      usage();
    }
  }
  void usage(){
    printf("Usage: %s <-verbose> <-dims int,int,int> <-dimtype global|local>\n",
           programname);
    printf("\tverbose: Opens file for reading after complete and prints all values.\n");
    printf("\tdims: Set the dimensions of the dataset. By default this is per-cpu.\n");
    printf("\t\tThis can be Global dimensions if you use the -dimtype flag.\n");
    printf("\t\tDefault is 20,20,20.\n");
    printf("\tdimtype: \'global\' makes dimensions apply to the overall dataset\n");
    printf("\t\tregardless of number of processors and \'local\' is per-processor.\n");
    printf("\t\tThe default is per-processor.\n");
  }
  void printStatus(FILE *f=stdout){
    fprintf(f,"%s: verbose=%u dims=%u,%u,%u dimtype=%s\n",
            programname,verbose,
            dims[0],dims[1],dims[2],
            (dimtype==Local)?"Local":"Global");
    fprintf(f,"\t Buffer Cache is at %u bytes\n",bufsize);
  }
};

int main(int argc,char *argv[]){
  MPIenv env(argc,argv); // start MPI
  MPIcomm *comm = env.getComm();
  CommandLine cmdln(argc,argv,comm);
  int origin[3],dims[3];
  int proclayout[3],area;
  int nghost=1;
  area=StupidProcLayoutSeive(comm->numProcs(),proclayout);
  // if(!comm->rank()) {
  cmdln.printStatus();
  fprintf(stderr,"Proclayout = %u : %u,%u,%u\n",area,
          proclayout[0],proclayout[1],proclayout[2]);
  //}
  for(int i=0;i<3;i++) dims[i]=cmdln.dims[i];
  if(cmdln.dimtype==CommandLine::Global)
    for(int i=0;i<3;i++) dims[i]/=proclayout[i]; // create dims for layout computation
  // Walk the processor grid (i fastest) to find this PE's block.
  for(int p=0,k=0;k<proclayout[2];k++){
    for(int j=0;j<proclayout[1];j++){
      for(int i=0;i<proclayout[0];i++,p++){
        if(p==comm->rank()){ // assumes same dims per process
          origin[0]=dims[0]*i;
          origin[1]=dims[1]*j;
          origin[2]=dims[2]*k;
          // widen this block by nghost cells where it has a lower neighbor
          for(int n=0;n<3;n++){
            if(origin[n]-nghost>0) {origin[n]-=nghost; dims[n]+=(nghost);}
            dims[n]+=nghost;
          }
          if(cmdln.dimtype==CommandLine::Global){
            // compute remainder for globaldims
            if(k==(proclayout[2]-1)) dims[2]=cmdln.dims[2]-origin[2];
            if(j==(proclayout[1]-1)) dims[1]=cmdln.dims[1]-origin[1];
            if(i==(proclayout[0]-1)) dims[0]=cmdln.dims[0]-origin[0];
          }
        }
      }
    }
  }
  fprintf(stderr,"PE(%u) origin{%u,%u,%u} localdims{%u,%u,%u} nghost=%u\n",
          comm->rank(),origin[0],origin[1],origin[2],
          dims[0],dims[1],dims[2],nghost);
  int size=1;
  for(int i=0;i<3;i++) size*=dims[i];
  // Everybody allocates their own local data
  float *data = new float[size];
  for(int i=0;i<size;i++) data[i]=(float)i; // fill pattern (original expression was lost)
  IObase *file; // declaration reconstructed; only PE0's pointer is non-null
  if(!comm->rank()){ // only PE0 needs to open a file
    IEEEIO *iofile;
    file = iofile = new IEEEIO("mpdata.raw",IObase::Write);
    if(cmdln.bufsize>0) iofile->bufferOn(cmdln.bufsize);
  }
  else file=0; // else it is null
  MPIO *pfile = new MPIO(file,*comm); // but everyone creates the pfile
  pfile->setLocalDims(3,origin,dims);
  // if(cmdln.syncsends) pfile->useSyncrhonousSend();
  // puts("Write");
  comm->barrier();
  double stime = MPI_Wtime(); // start timing
  //Timer t;
  // t.start();
  for(int i=0;i<cmdln.ntrials;i++){
    if(comm->rank()==0) fprintf(stderr,"Trial %4u\n",i);
    comm->barrier();
    pfile->write(data);
  }
  comm->barrier(); // synchronize completion
  double etime = MPI_Wtime();
  // t.end();
  if(!comm->rank()){
    printf("\nElapsed time to write=%lf\n",etime-stime);
  }
  puts("done writing");
  delete pfile;
  if(file) delete file;
  delete [] data; // was "delete data": array allocations need delete []
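  // Read-back check (PE0 only): reopen the file serially and report the
  // effective write bandwidth,
  //   MB/s = ntrials * nBytes(dt,nrank,ndims) / (2^20 * (etime-stime)),
  // i.e. total bytes written over the timed trials divided by wall-clock
  // time. Illustrative arithmetic with assumed sizes (not from any real
  // run): a 64x64x64 float dataset is 64^3 * 4 bytes = 1 MiB, so 10 trials
  // finishing in 2 seconds would report 5 MB/s.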
  if(!comm->rank()){
    file = new IEEEIO("mpdata.raw",IObase::Read);
    int ndims[3],nrank;
    IObase::DataType dt;
    file->readInfo(dt,nrank,ndims);
    printf("IO performance = %f Megabytes/sec\n",
           (float)(cmdln.ntrials)*(float)(IObase::nBytes(dt,nrank,ndims))/
           (1024*1024*(etime-stime)));
    printf("Read mpdata.raw: rank=%u dims=%u,%u,%u\n",
           nrank,ndims[0],ndims[1],ndims[2]);
    if(cmdln.verbose){ // dangerous on T3E
      data = new float[IObase::nElements(nrank,ndims)];
      file->read(data);
      // The source was truncated mid-loop here; the dump below is a minimal
      // reconstruction (k/j/i order matches the layout loops above).
      for(int k=0,p=0;k<ndims[2];k++)
        for(int j=0;j<ndims[1];j++)
          for(int i=0;i<ndims[0];i++,p++)
            printf("data(%u,%u,%u)=%f\n",i,j,k,data[p]);
      delete [] data;
    }
    delete file;
  }
  return 0;
}
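// Example invocation (hypothetical launcher name and binary path; only the
// -verbose/-dims/-dimtype flags are documented by usage() above):
//
//   mpirun -np 8 ./mptest -dims 64,64,64 -dimtype global
//
// With 8 PEs, StupidProcLayoutSeive selects proclayout {2,2,2} (metric
// 2+2+2=6, the smallest i+j+k over all factorizations of 8), so each PE is
// assigned a 32x32x32 block of the 64x64x64 global dataset before the
// ghost-cell widening above.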