#include "grid_remote_connector.hpp" #include "client_client_dht_template.hpp" #include "leader_process.hpp" #include "mpi.hpp" namespace xios { /** * \brief class constructor. * \param srcView List of sources views. * \param dstView List of remotes views. * \param localComm Local MPI communicator * \param remoteSize Size of the remote communicator */ CGridRemoteConnector::CGridRemoteConnector(vector>& srcView, vector>& dstView, MPI_Comm localComm, int remoteSize) : srcView_(srcView), dstView_(dstView), localComm_(localComm), remoteSize_(remoteSize) {} /** * \brief class constructor. * \param srcView List of sources views. * \param dstView List of remotes views. * \param localComm Local MPI communicator * \param remoteSize Size of the remote communicator */ CGridRemoteConnector::CGridRemoteConnector(vector>& srcView, vector< shared_ptr >& dstView, MPI_Comm localComm, int remoteSize) : srcView_(srcView), localComm_(localComm), remoteSize_(remoteSize) { for(auto& it : dstView) dstView_.push_back((shared_ptr) it) ; } /** * \brief Compute if each view composing the source grid and the remote grid is distributed or not. * Result is stored on internal attributes \b isSrcViewDistributed_ and \b isDstViewDistributed_. * \detail To compute this, a hash is computed for each array on indices. The hash must permutable, i.e. * the order of the list of global indices doesn't influence the value of the hash. So simply a sum of * hash of each indices is used for the whole array. After, the computed hash are compared with each other * ranks of \b localComm_ MPI communicator using an MPI_ALLReduce. If, for each ranks, the hash is the same * then the view is not distributed */ void CGridRemoteConnector::computeViewDistribution(void) { HashXIOS hashGlobalIndex; // hash function-object int nDst = dstView_.size() ; vector hashRank(remoteSize_) ; isDstViewDistributed_.resize(nDst) ; for(int i=0; i> globalIndexView ; dstView_[i]->getGlobalIndexView(globalIndexView) ; hashRank.assign(remoteSize_,0) ; // everybody ranks to 0 except rank of the remote view I have // that would be assign to my local hash for(auto& it : globalIndexView) { int rank=it.first ; CArray& globalIndex = it.second ; size_t globalIndexSize = globalIndex.numElements(); size_t hashValue=0 ; for(size_t ind=0;ind globalIndex ; srcView_[i]->getGlobalIndexView(globalIndex) ; hashRank.assign(commSize,0) ; // 0 for everybody except my rank size_t globalIndexSize = globalIndex.numElements() ; size_t hashValue=0 ; for(size_t ind=0;ind> srcView ; vector> dstView ; vector indElements ; elements_.resize(srcView_.size()) ; bool srcViewsNonDistributed=true ; // not usefull now but later for optimization for(int i=0;i ranks ; for(auto& it : elements_[indElements[0]]) { if (it.second.numElements()==0) ranks[it.first] = false ; else ranks[it.first] = true ; } } /** * \brief Compute the connector, i.e. compute the \b elements_ attribute. * \detail In order to achive better optimisation, * we distingute the case when the grid is not distributed on source grid (\bcomputeSrcNonDistributed), * or the remote grid (\b computeDstNonDistributed), or the both (\b computeSrcDstNonDistributed). * Otherwise the generic method is called computeGenericMethod. Note that in the case, if one element view * is not distributed on the source and on the remote grid, then we can used the tensorial product * property to computing it independently using \b computeSrcDstNonDistributed method. 
  /**
   * \brief Compute the connector, i.e. compute the \b elements_ attribute.
   * \detail In order to achieve better optimisation, we distinguish the cases where the grid is not distributed
   *         on the source side (\b computeSrcNonDistributed), on the remote side (\b computeDstNonDistributed),
   *         or on both sides (\b computeSrcDstNonDistributed). Otherwise the generic method
   *         \b computeGenericMethod is called. Note that if one element view is not distributed on the source
   *         and on the remote grid, we can use the tensorial-product property to compute it independently using
   *         the \b computeSrcDstNonDistributed method.
   *         After that, the \b removeRedondantRanks method can be called to suppress blocks of data that would
   *         otherwise be sent redundantly to the remote servers.
   */
  void CGridRemoteConnector::computeConnectorMethods(void)
  {
    vector<shared_ptr<CLocalView>> srcView ;
    vector<shared_ptr<CDistributedView>> dstView ;
    vector<int> indElements ;
    elements_.resize(srcView_.size()) ;

    bool srcViewsNonDistributed=true ;
    for(int i=0;i<srcView_.size();i++) srcViewsNonDistributed = srcViewsNonDistributed && !isSrcViewDistributed_[i] ;

    bool dstViewsNonDistributed=true ;
    for(int i=0;i<dstView_.size();i++) dstViewsNonDistributed = dstViewsNonDistributed && !isDstViewDistributed_[i] ;

    if (srcViewsNonDistributed)
    {
      int commRank, commSize ;
      MPI_Comm_rank(localComm_,&commRank) ;
      MPI_Comm_size(localComm_,&commSize) ;
      list<int> remoteRanks;
      list<int> notUsed ;
      map<int,bool> ranks ;
      computeLeaderProcess(commRank, commSize, remoteSize_, remoteRanks, notUsed) ;
      for(int rank : remoteRanks) ranks[rank]=true ;

      for(int i=0; i<srcView_.size(); i++)
      {
        if (isDstViewDistributed_[i]) computeSrcNonDistributed(i) ;
        else computeSrcDstNonDistributed(i, ranks) ;
      }
    }
    else if (dstViewsNonDistributed)
    {
      map<int,bool> ranks ;
      for(int i=0;i<remoteSize_;i++) ranks[i]=true ;

      for(int i=0; i<srcView_.size(); i++)
      {
        if (isSrcViewDistributed_[i]) computeDstNonDistributed(i, ranks) ;
        else computeSrcDstNonDistributed(i, ranks) ;
      }
    }
    else
    {
      for(int i=0;i<srcView_.size();i++)
        if (isSrcViewDistributed_[i] || isDstViewDistributed_[i])
        {
          srcView.push_back(srcView_[i]) ;
          dstView.push_back(dstView_[i]) ;
          indElements.push_back(i) ;
        }

      computeGenericMethod(srcView, dstView, indElements) ;

      map<int,bool> ranks ;
      for(auto& it : elements_[indElements[0]])
      {
        if (it.second.numElements()==0) ranks[it.first] = false ;
        else ranks[it.first] = true ;
      }

      for(int i=0;i<srcView_.size();i++)
        if (!isSrcViewDistributed_[i] && !isDstViewDistributed_[i]) computeSrcDstNonDistributed(i, ranks) ;
    }
  }

  /**
   * \brief Compute the remote connector for the element \b i when the source view is not distributed.
   *        After the call, element_[i] is defined.
   * \param i Index of the element composing the source grid.
   */
  void CGridRemoteConnector::computeSrcNonDistributed(int i)
  {
    map<int,CArray<size_t,1>> globalIndexView ;
    dstView_[i]->getGlobalIndexView(globalIndexView) ;

    CClientClientDHTTemplate<int>::Index2InfoTypeMap dataInfo;

    for(auto& it : globalIndexView)
    {
      auto& globalIndex=it.second ;
      for(size_t ind : globalIndex) dataInfo[ind]=it.first ;
    }

    // First we feed the distributed hash map with key (remote global index)
    // associated with the value of the remote rank
    CClientClientDHTTemplate<int> DHT(dataInfo, localComm_) ;
    // then we feed the DHT with the local global indices of the source view

    int commRank, commSize ;
    MPI_Comm_rank(localComm_,&commRank) ;
    MPI_Comm_size(localComm_,&commSize) ;
    CArray<size_t,1> srcIndex ;
    // since the source view is not distributed, only rank 0 needs to feed the DHT
    if (commRank==0) srcView_[i]->getGlobalIndexView(srcIndex) ;

    // compute the mapping
    DHT.computeIndexInfoMapping(srcIndex) ;
    auto& returnInfo = DHT.getInfoIndexMap() ;

    // returnInfo now contains, for each global index, the list of remote ranks it must be sent to,
    // but only on rank 0 because it is the one that fed the DHT.
    // So rank 0 needs to send each list to the corresponding server leader, i.e. the local process
    // that specifically handles one or more servers.

    // rankIndGlo : rankIndGlo[rank][indGlo] : list of indices to send to the remote server of rank "rank"
    vector<vector<size_t>> rankIndGlo(remoteSize_) ;
    if (commRank==0)
      for(auto& it1 : returnInfo)
        for(auto& it2 : it1.second) rankIndGlo[it2].push_back(it1.first) ;

    vector<MPI_Request> requests ;

    if (commRank==0)
    {
      requests.resize(remoteSize_) ;
      for(int i=0 ; i<remoteSize_ ; i++)
      {
        // find the local process that is the leader of remote rank i, and send it the
        // corresponding block of global indices (tag = remote rank)
        int leader=-1 ;
        for(int proc=0 ; proc<commSize && leader==-1 ; proc++)
        {
          list<int> leaderRanks, dummy ;
          computeLeaderProcess(proc, commSize, remoteSize_, leaderRanks, dummy) ;
          for(int rank : leaderRanks) if (rank==i) leader=proc ;
        }
        MPI_Isend(rankIndGlo[i].data(), rankIndGlo[i].size(), MPI_SIZE_T, leader, i, localComm_, &requests[i]) ;
      }
    }

    list<int> remoteRanks;
    list<int> notUsed ;
    // I am a server leader of which remote ranks ?
    computeLeaderProcess(commRank, commSize, remoteSize_, remoteRanks, notUsed) ;

    for(auto remoteRank : remoteRanks)
    {
      MPI_Status status ;
      int size ;
      MPI_Probe(0,remoteRank,localComm_, &status);
      MPI_Get_count(&status, MPI_SIZE_T, &size) ;
      elements_[i][remoteRank].resize(size) ;
      // for each remote rank, receive the global indices from proc 0
      MPI_Recv(elements_[i][remoteRank].dataFirst(),size, MPI_SIZE_T,0,remoteRank, localComm_,&status) ;
    }

    if (commRank==0)
    {
      vector<MPI_Status> status(remoteSize_) ;
      // the sends are asynchronous, wait for completion
      MPI_Waitall(remoteSize_, requests.data(), status.data()) ;
    }
  }
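  // Note on the CClientClientDHTTemplate pattern used above and below: the DHT is first filled with
  // (key = remote global index, value = remote rank) pairs, then queried with the local source global
  // indices; the answer tells, for each queried index, which remote rank(s) own it. A rough sketch of the
  // exchange (names as used in this file):
  //
  //   CClientClientDHTTemplate<int>::Index2InfoTypeMap dataInfo ;   // remote global index -> remote rank
  //   // ... fill dataInfo from the remote view ...
  //   CClientClientDHTTemplate<int> DHT(dataInfo, localComm_) ;     // collective over localComm_
  //   DHT.computeIndexInfoMapping(srcIndex) ;                       // query with local source indices
  //   auto& returnInfo = DHT.getInfoIndexMap() ;                    // source index -> remote rank(s)
  //
  // Both the construction and the query are collective over localComm_, which is why even processes with an
  // empty query array must still take part in the calls.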
  /**
   * \brief Compute the remote connector for the element \b i when the remote view is not distributed.
   *        After the call, element_[i] is defined.
   * \param i Index of the element composing the remote grid.
   * \param ranks The list of ranks for which the local process is in charge of computing the connector
   *        (e.g. if it is a server leader). If ranks[rank] == false, the corresponding elements_ entry is set
   *        to an empty array (no data to send), just to notify the corresponding remote server, since the call
   *        is collective with the other ones.
   */
  void CGridRemoteConnector::computeDstNonDistributed(int i, map<int,bool>& ranks)
  {
    auto& element = elements_[i] ;
    map<int,CArray<size_t,1>> globalIndexView ;
    dstView_[i]->getGlobalIndexView(globalIndexView) ;

    CClientClientDHTTemplate<int>::Index2InfoTypeMap dataInfo;

    // First we feed the distributed hash map with key (remote global index)
    // associated with the value of the remote rank
    for(auto& it : globalIndexView)
      if (it.first==0) // since the remote view is not distributed, insert only the remote rank 0
      {
        auto& globalIndex=it.second ;
        for(size_t ind : globalIndex) dataInfo[ind]=0 ; // associated to rank 0
      }

    CClientClientDHTTemplate<int> DHT(dataInfo, localComm_) ;

    // then we feed the DHT with the local global indices of the source view
    CArray<size_t,1> srcIndex ;
    srcView_[i]->getGlobalIndexView(srcIndex) ;
    DHT.computeIndexInfoMapping(srcIndex) ;
    auto& returnInfo = DHT.getInfoIndexMap() ;

    // returnInfo now contains, for each global index, the list of remote ranks it must be sent to.
    // Now construct the element_ list of global indices for each rank in my list, except when the array must be empty.
    for (auto& rank : ranks)
    {
      if (rank.second) // non-empty array => ranks that have data to receive
      {
        int size=0 ;
        for(auto& it : returnInfo) if (!it.second.empty()) size++ ;
        auto& array = element[rank.first] ;
        array.resize(size) ;
        size=0 ;
        for(auto& it : returnInfo)
          if (!it.second.empty())
          {
            array(size)=it.first ;
            size++ ;
          }
      }
      else element[rank.first] = CArray<size_t,1>(0) ; // empty array => ranks that have no data to receive
    }
  }
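  // Note on the empty arrays inserted above: a remote server that receives no data still has to be notified,
  // because the remote call is collective over all servers. Inserting a 0-size array for such a rank produces
  // an empty but well-formed message, e.g. (illustrative):
  //
  //   elements_[i][rank] = CArray<size_t,1>(0) ;   // "nothing to send, but the event still reaches this server"
  //
  // The same convention is used in computeSrcDstNonDistributed and computeGenericMethod below.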
  /**
   * \brief Compute the remote connector for the element \b i when neither the source nor the remote view is distributed.
   *        After the call, element_[i] is defined.
   * \param i Index of the element composing the remote grid.
   * \param ranks The list of ranks for which the local process is in charge of computing the connector
   *        (e.g. if it is a server leader). If ranks[rank] == false, the corresponding elements_ entry is set
   *        to an empty array (no data to send), just to notify the corresponding remote server, since the call
   *        is collective with the other ones.
   */
  void CGridRemoteConnector::computeSrcDstNonDistributed(int i, map<int,bool>& ranks)
  {
    auto& element = elements_[i] ;
    map<int,CArray<size_t,1>> globalIndexView ;
    dstView_[i]->getGlobalIndexView(globalIndexView) ;

    CClientClientDHTTemplate<int>::Index2InfoTypeMap dataInfo;

    // First we feed the distributed hash map with key (remote global index)
    // associated with the value of the remote rank
    for(auto& it : globalIndexView)
      if (it.first==0) // insert only the remote rank 0 since the remote view is not distributed
      {
        auto& globalIndex=it.second ;
        for(size_t ind : globalIndex) dataInfo[ind]=0 ; // associated to rank 0
      }

    CClientClientDHTTemplate<int> DHT(dataInfo, localComm_) ;

    // then we feed the DHT with the local global indices of the source view
    int commRank, commSize ;
    MPI_Comm_rank(localComm_,&commRank) ;
    MPI_Comm_size(localComm_,&commSize) ;
    CArray<size_t,1> srcIndex ;
    // since the source view is not distributed, only rank 0 needs to feed the DHT
    if (commRank==0) srcView_[i]->getGlobalIndexView(srcIndex) ;
    DHT.computeIndexInfoMapping(srcIndex) ;
    auto& returnInfo = DHT.getInfoIndexMap() ;

    vector<size_t> indGlo ;
    if (commRank==0)
      for(auto& it1 : returnInfo)
        for(auto& it2 : it1.second) indGlo.push_back(it1.first) ;

    // Now local rank 0 knows which indices to send to remote rank 0, but every server must receive the same
    // information, and only the leader ranks will send it. So local rank 0 must broadcast the information to
    // all leaders. For this we create a new communicator composed of the local processes that must send data
    // to a remote rank, the data is broadcast over it, and element_[i] is constructed for each remote rank in charge.
    int color=0 ;
    if (ranks.empty()) color=0 ;
    else color=1 ;
    if (commRank==0) color=1 ;
    MPI_Comm newComm ;
    MPI_Comm_split(localComm_, color, commRank, &newComm) ;
    if (color==1)
    {
      // ok, I am one of the processes that must send something to one or more remote servers,
      // so I get the list of global indices from rank 0
      int dataSize ;
      if (commRank==0) dataSize=indGlo.size() ;
      MPI_Bcast(&dataSize,1,MPI_INT, 0, newComm) ;
      indGlo.resize(dataSize) ;
      MPI_Bcast(indGlo.data(),dataSize,MPI_SIZE_T,0,newComm) ;
    }
    MPI_Comm_free(&newComm) ;

    // construct element_[i] from indGlo
    for(auto& rank : ranks)
    {
      if (rank.second)
      {
        int dataSize=indGlo.size();
        auto& indGloArray = element[rank.first] ;
        indGloArray.resize(dataSize) ;
        for(int ind=0 ; ind<dataSize ; ind++) indGloArray(ind)=indGlo[ind] ;
      }
      else element[rank.first] = CArray<size_t,1>(0) ;
    }
  }
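  // Note on the MPI_Comm_split used above: the color isolates the processes that actually have something to
  // send (the leaders, plus rank 0 which owns the result of the DHT query) so that the index list is broadcast
  // only to them instead of to the whole localComm_. A reduced sketch of the pattern (illustrative):
  //
  //   int color = (ranks.empty() && commRank!=0) ? 0 : 1 ;
  //   MPI_Comm newComm ;
  //   MPI_Comm_split(localComm_, color, commRank, &newComm) ;   // rank 0 keeps rank 0 in newComm
  //   if (color==1) MPI_Bcast(/*...*/ indGlo.data(), dataSize, MPI_SIZE_T, 0, newComm) ;
  //   MPI_Comm_free(&newComm) ;
  //
  // Every process of localComm_ must still call MPI_Comm_split and MPI_Comm_free, since both are collective.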
  /**
   * \brief Generic method to compute the grid remote connector. Only distributed elements are specified in the
   *        source views and remote views; connectors for non-distributed elements are computed separately to
   *        improve performance and memory consumption. After the call, \b elements_ is defined.
   * \param srcView List of the source views composing the grid, without the non-distributed views.
   * \param dstView List of the remote views composing the grid, without the non-distributed views.
   * \param indElements Indices of the views, making the correspondence between all views and the distributed
   *        views given in input.
   */
  void CGridRemoteConnector::computeGenericMethod(vector<shared_ptr<CLocalView>>& srcView, vector<shared_ptr<CDistributedView>>& dstView, vector<int>& indElements)
  {
    // generic method, every element can be distributed
    int nDst = dstView.size() ;
    vector<size_t> dstSliceSize(nDst) ;
    dstSliceSize[0] = 1 ;
    for(int i=1; i<nDst; i++) dstSliceSize[i] = dstView[i-1]->getGlobalSize()*dstSliceSize[i-1] ;

    CClientClientDHTTemplate<int>::Index2VectorInfoTypeMap dataInfo ;   // grid global index -> list of remote ranks
    CClientClientDHTTemplate<size_t>::Index2VectorInfoTypeMap info ;    // info map

    // First, we need to feed the DHT with the global indices of the remote servers. For that:
    // - the first element is inserted in a DHT with the rank as key and the list of associated global indices as value;
    // - then, for each following element, the previously stored indices associated with the remote ranks I am in
    //   charge of are retrieved, and the global indices corresponding to the position of the element in the remote
    //   view are reinserted using the tensorial product;
    // - finally we get the list of remote global indices of the whole remote grid I am in charge of.
    for(int pos=0; pos<nDst; pos++)
    {
      size_t sliceSize=dstSliceSize[pos] ;
      map<int,CArray<size_t,1>> globalIndexView ;
      dstView[pos]->getGlobalIndexView(globalIndexView) ;

      CClientClientDHTTemplate<size_t>::Index2VectorInfoTypeMap lastInfo(info) ;

      if (pos>0)
      {
        CArray<size_t,1> ranks(globalIndexView.size()) ;
        auto it=globalIndexView.begin() ;
        for(int i=0 ; i<ranks.numElements() ; i++,it++) ranks(i)=it->first ;
        CClientClientDHTTemplate<size_t> dataRanks(info, localComm_) ;
        dataRanks.computeIndexInfoMapping(ranks) ;
        lastInfo = dataRanks.getInfoIndexMap() ;
      }

      info.clear() ;
      for(auto& it : globalIndexView)
      {
        int rank = it.first ;
        auto& globalIndex = it.second ;
        auto& inf = info[rank] ;
        if (pos==0) for(int i=0;i<globalIndex.numElements();i++) inf.push_back(globalIndex(i)) ;
        else
        {
          auto& lastGlobalIndex = lastInfo[rank] ;
          for(size_t lastGlobalInd : lastGlobalIndex)
            for(int i=0;i<globalIndex.numElements();i++) inf.push_back(globalIndex(i)*sliceSize+lastGlobalInd) ;
        }
      }

      if (pos==nDst-1)
      {
        // last element : info now holds the grid global indices, record the owning remote rank for each of them
        for(auto& it : info)
        {
          int rank = it.first ;
          auto& globalIndex = it.second ;
          for(auto globalInd : globalIndex) dataInfo[globalInd].push_back(rank) ;
        }
      }
    }

    CClientClientDHTTemplate<int> dataRanks(dataInfo, localComm_) ;

    // generate the list of global indices for the source view
    int nSrc = srcView.size() ;
    vector<size_t> srcSliceSize(nSrc) ;
    srcSliceSize[0] = 1 ;
    for(int i=1; i<nSrc; i++) srcSliceSize[i] = srcView[i-1]->getGlobalSize()*srcSliceSize[i-1] ;

    vector<size_t> srcGlobalIndex ;
    size_t sliceIndex=0 ;
    srcView[nSrc-1]->getGlobalIndex(srcGlobalIndex, sliceIndex, srcSliceSize.data(), srcView.data(), nSrc-1) ;
    // now we have the global indices of the source grid in srcGlobalIndex;
    // we feed the DHT with the source global indices (if we have some)
    if (srcGlobalIndex.size()>0)
    {
      CArray<size_t,1> srcGlobalIndexArray(srcGlobalIndex.data(), shape(srcGlobalIndex.size()),neverDeleteData) ;
      dataRanks.computeIndexInfoMapping(srcGlobalIndexArray) ;
    }
    else
    {
      CArray<size_t,1> srcGlobalIndexArray ;
      dataRanks.computeIndexInfoMapping(srcGlobalIndexArray) ;
    }
    const auto& returnInfo = dataRanks.getInfoIndexMap() ;

    // returnInfo now contains, for each grid global index, the list of remote ranks it must be sent to.
    // But we want to use the tensorial-product property to express the same information using only the global
    // indices of the element views. So the idea is to reverse the information: for a global index of the grid
    // to send to a remote server, what is the global index of each element composing the grid ?
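    // Reminder on the flattening convention used here: with slice sizes srcSliceSize[i] = product of the global
    // sizes of elements 0..i-1, a grid global index is gridIndexGlo = sum_i localIndGlo_i * srcSliceSize[i].
    // The per-element indices are therefore recovered from the highest element downwards by successive
    // division / modulo, which is exactly what the loop below does. Small illustrative example with two
    // elements of global sizes 4 and 3 (srcSliceSize = {1,4}): gridIndexGlo = 10 gives
    // localIndGlo_1 = 10/4 = 2 and localIndGlo_0 = 10%4 = 2.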
    vector<map<int,set<size_t>>> elements(nSrc) ; // internal representation of the elements composing the grid

    for(auto& indRanks : returnInfo)
    {
      size_t gridIndexGlo=indRanks.first ;
      auto& ranks = indRanks.second ;
      for(int i=nSrc-1; i>=0; i--)
      {
        auto& element = elements[i] ;
        size_t localIndGlo = gridIndexGlo / srcSliceSize[i] ;
        gridIndexGlo = gridIndexGlo % srcSliceSize[i] ;
        for(int rank : ranks) element[rank].insert(localIndGlo) ;
      }
    }

//  elements_.resize(nSrc) ;
    for(int i=0 ; i<nSrc ; i++)
    {
      auto& element = elements[i] ;
      for(auto& rankInd : element)
      {
        int rank = rankInd.first ;
        set<size_t>& indGlo = rankInd.second ;
        CArray<size_t,1>& indGloArray = elements_[indElements[i]][rank] ;
        indGloArray.resize(indGlo.size()) ;
        int j=0 ;
        for (auto index : indGlo) { indGloArray(j) = index ; j++; }
      }
    }

    // What about the servers that have no data to receive ? They must still be informed, so that they receive
    // an event with no data. So find the remote servers with no data; one client takes charge of each of them
    // and sends it a global index array with no data (0-size).
    vector<int> ranks(remoteSize_,0) ;
    for(auto& it : elements_[indElements[0]]) ranks[it.first] = 1 ;
    MPI_Allreduce(MPI_IN_PLACE, ranks.data(), remoteSize_, MPI_INT, MPI_SUM, localComm_) ;
    int commRank, commSize ;
    MPI_Comm_rank(localComm_,&commRank) ;
    MPI_Comm_size(localComm_,&commSize) ;
    int pos=0 ;
    for(int i=0; i<remoteSize_ ; i++)
      if (ranks[i]==0)
      {
        // the empty remote ranks are distributed round-robin among the local processes
        if (pos%commSize==commRank)
          for(int j=0 ; j<nSrc ; j++) elements_[indElements[j]][i] = CArray<size_t,1>(0) ;
        pos++ ;
      }
  }

  /**
   * \brief Once the connector is computed (\b elements_ is filled), data that would be sent redundantly to the
   *        servers can be avoided. This call computes the redundant ranks and stores them in the \b rankToRemove_
   *        attribute. The idea is to compute a hash of each block of indices to be sent to a specific server rank,
   *        so that the data to send to a rank is summarized by a hash; the hashes are then compared between local
   *        ranks, and the redundant blocks corresponding to the same hash are marked for removal.
   */
  void CGridRemoteConnector::computeRedondantRanks(void)
  {
    int commRank ;
    MPI_Comm_rank(localComm_,&commRank) ;

    set<int> ranks;
    for(auto& element : elements_)
      for(auto& it : element) ranks.insert(it.first) ;

    for(auto& element : elements_)
      for(auto& it : element)
        if (ranks.count(it.first)==0) ERROR("void CGridRemoteConnector::removeRedondantRanks(void)",<<"number of ranks in elements is not coherent between each element") ;

    HashXIOS<size_t> hashGlobalIndex;

    // compute, for each connected remote rank, a hash combining the blocks of global indices of all elements
    map<int,size_t> hashRanks ;
    for(auto& element : elements_)
      for(auto& it : element)
      {
        auto& globalIndex = it.second ;
        int rank = it.first ;
        size_t hash ;
        hash=0 ;
        for(int i=0; i<globalIndex.numElements(); i++) hash = hashGlobalIndex.hashCombine(hash, globalIndex(i)) ;
        if (hashRanks.count(rank)==0) hashRanks[rank] = hash ;
        else hashRanks[rank] = hashGlobalIndex.hashCombine(hashRanks[rank], hash) ;
      }

    // combine each hash with the remote rank itself, so that identical data sent to different remote ranks
    // is not considered redundant
    CClientClientDHTTemplate<int>::Index2InfoTypeMap info ;
    map<size_t,int> hashRank ;
    HashXIOS<int> hashGlobalIndexRank;
    for(auto& it : hashRanks)
    {
      it.second = hashGlobalIndexRank.hashCombine(it.first,it.second) ;
      info[it.second]=commRank ;
      hashRank[it.second]=it.first ;
    }

    // we feed a DHT map with key : hash, value : my rank
    CClientClientDHTTemplate<int> dataHash(info, localComm_) ;
    CArray<size_t,1> hashList(hashRank.size()) ;

    int i=0 ;
    for(auto& it : hashRank) { hashList(i)=it.first ; i++; }

    // now, which ranks have the same hash ? feed the DHT with my list of hashes
    dataHash.computeIndexInfoMapping(hashList) ;
    auto& hashRankList = dataHash.getInfoIndexMap() ;

    for(auto& it : hashRankList)
    {
      size_t hash = it.first ;
      auto& ranks = it.second ;

      bool first=true ;
      // only the process with the lowest rank stays in charge of sending the data to the remote server
      for(int rank : ranks) if (commRank>rank) first=false ;
      if (!first) rankToRemove_.insert(hashRank[hash]) ;
    }
  }

}
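// Note: rankToRemove_ only marks the redundant (remote rank) blocks; the actual pruning of elements_ is done by
// the removeRedondantRanks pass mentioned in the documentation above. A hypothetical sketch of what such a pass
// could look like, assuming rankToRemove_ and elements_ as defined in this file:
//
//   void CGridRemoteConnector::removeRedondantRanks(void)
//   {
//     computeRedondantRanks() ;
//     for(int rank : rankToRemove_)
//       for(auto& element : elements_) element.erase(rank) ;   // drop blocks another local process already sends
//   }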