source: XIOS/dev/dev_ym/XIOS_COUPLING/src/context_client.cpp @ 2255

Last change on this file since 2255 was 2246, checked in by ymipsl, 3 years ago
  • Update of the transfer protocol using one-sided communication
  • Introduce MPI_Improbe/MPI_Mrecv to listen for incoming requests
  • Introducing latency when looping over managers

YM

  • Property copyright set to
    Software name : XIOS (Xml I/O Server)
    http://forge.ipsl.jussieu.fr/ioserver
    Creation date : January 2009
    Licence : CeCILL version 2
    see license file in root directory : Licence_CeCILL_V2-en.txt
    or http://www.cecill.info/licences/Licence_CeCILL_V2-en.html
    Holder : CEA/LSCE (Laboratoire des Sciences du CLimat et de l'Environnement)
    CNRS/IPSL (Institut Pierre Simon Laplace)
    Project Manager : Yann Meurdesoif
    yann.meurdesoif@cea.fr
  • Property svn:eol-style set to native
File size: 18.6 KB
RevLine 
[591]1#include "xios_spl.hpp"
[300]2#include "context_client.hpp"
3#include "context_server.hpp"
4#include "event_client.hpp"
5#include "buffer_out.hpp"
6#include "buffer_client.hpp"
7#include "type.hpp"
8#include "event_client.hpp"
9#include "context.hpp"
[382]10#include "mpi.hpp"
[347]11#include "timer.hpp"
[401]12#include "cxios.hpp"
[1130]13#include "server.hpp"
[2130]14#include "services.hpp"
15#include <boost/functional/hash.hpp>
16#include <random>
17#include <chrono>
[300]18
[335]19namespace xios
[300]20{
[512]21    /*!
22    \param [in] parent Pointer to context on client side
23    \param [in] intraComm_ communicator of group client
24    \param [in] interComm_ communicator of group server
[983]25    \cxtSer [in] cxtSer Pointer to context of server side. (It is only used in case of attached mode).
[512]26    */
[1639]27    CContextClient::CContextClient(CContext* parent, MPI_Comm intraComm_, MPI_Comm interComm_, CContext* cxtSer)
[1853]28     : mapBufferSize_(), parentServer(cxtSer), maxBufferedEvents(4), associatedServer_(nullptr)
[300]29    {
[1757]30     
[2130]31      context_ = parent;
[595]32      intraComm = intraComm_;
33      interComm = interComm_;
[1639]34      MPI_Comm_rank(intraComm, &clientRank);
35      MPI_Comm_size(intraComm, &clientSize);
[509]36
[595]37      int flag;
[1639]38      MPI_Comm_test_inter(interComm, &flag);
[1761]39      if (flag) isAttached_=false ;
40      else  isAttached_=true ;
41
42      pureOneSided=CXios::getin<bool>("pure_one_sided",false); // pure one sided communication (for test)
43      if (isAttachedModeEnabled()) pureOneSided=false ; // no one sided in attach mode
44     
45
46
[1639]47      if (flag) MPI_Comm_remote_size(interComm, &serverSize);
48      else  MPI_Comm_size(interComm, &serverSize);
[509]49
[1232]50      computeLeader(clientRank, clientSize, serverSize, ranksServerLeader, ranksServerNotLeader);
51
[2246]52      if (flag) MPI_Intercomm_merge(interComm_,false, &interCommMerged) ;
[1757]53     
54      if (!isAttachedModeEnabled())
55      { 
[2246]56
57        CTimer::get("create Windows").resume() ;
58
59        // We create dummy pair of intercommunicator between clients and server
60        // Why ? Just because on openMPI, it reduce the creation time of windows otherwhise which increase quadratically
61        // We don't know the reason
62     
63        MPI_Comm commSelf ;
64        MPI_Comm_split(intraComm_,clientRank,clientRank, &commSelf) ;
65        vector<MPI_Comm> dummyComm(serverSize) ;
66        for(int rank=0; rank<serverSize; rank++) MPI_Intercomm_create(commSelf, 0, interCommMerged, clientSize+rank, 0, &dummyComm[rank]) ;
67
68        // create windows for one-sided
[1757]69        windows.resize(serverSize) ;
70        MPI_Comm winComm ;
71        for(int rank=0; rank<serverSize; rank++)
72        {
73          windows[rank].resize(2) ;
74          MPI_Comm_split(interCommMerged, rank, clientRank, &winComm);
75          MPI_Win_create_dynamic(MPI_INFO_NULL, winComm, &windows[rank][0]);
76          MPI_Win_create_dynamic(MPI_INFO_NULL, winComm, &windows[rank][1]);
[2222]77//       ym : Warning : intelMPI doesn't support that communicator of windows be deallocated before the windows deallocation, crash at MPI_Win_lock
78//            Bug or not ?         
[2246]79//          MPI_Comm_free(&winComm) ;
[1757]80        }
[2246]81       
82        // free dummy intercommunicator => take times ?
83        for(int rank=0; rank<serverSize; rank++)  MPI_Comm_free(&dummyComm[rank]) ;
84        MPI_Comm_free(&commSelf) ;
[1757]85
[2246]86        CTimer::get("create Windows").resume() ;
87     }
[1757]88
[2130]89      auto time=chrono::system_clock::now().time_since_epoch().count() ;
90      std::default_random_engine rd(time); // not reproducible from a run to another
91      std::uniform_int_distribution<size_t> dist;
92      hashId_=dist(rd) ;
93      MPI_Bcast(&hashId_,1,MPI_SIZE_T,0,intraComm) ; // Bcast to all server of the context
94
[1757]95      timeLine = 1;
[1232]96    }
97
98    void CContextClient::computeLeader(int clientRank, int clientSize, int serverSize,
99                                       std::list<int>& rankRecvLeader,
100                                       std::list<int>& rankRecvNotLeader)
101    {
102      if ((0 == clientSize) || (0 == serverSize)) return;
103
[595]104      if (clientSize < serverSize)
105      {
106        int serverByClient = serverSize / clientSize;
107        int remain = serverSize % clientSize;
108        int rankStart = serverByClient * clientRank;
[300]109
[595]110        if (clientRank < remain)
111        {
112          serverByClient++;
113          rankStart += clientRank;
114        }
115        else
116          rankStart += remain;
117
118        for (int i = 0; i < serverByClient; i++)
[1232]119          rankRecvLeader.push_back(rankStart + i);
[1021]120
[1232]121        rankRecvNotLeader.resize(0);
[1158]122      }
[595]123      else
124      {
125        int clientByServer = clientSize / serverSize;
126        int remain = clientSize % serverSize;
127
128        if (clientRank < (clientByServer + 1) * remain)
129        {
130          if (clientRank % (clientByServer + 1) == 0)
[1232]131            rankRecvLeader.push_back(clientRank / (clientByServer + 1));
[1021]132          else
[1232]133            rankRecvNotLeader.push_back(clientRank / (clientByServer + 1));
[595]134        }
135        else
136        {
137          int rank = clientRank - (clientByServer + 1) * remain;
138          if (rank % clientByServer == 0)
[1232]139            rankRecvLeader.push_back(remain + rank / clientByServer);
[1021]140          else
[1232]141            rankRecvNotLeader.push_back(remain + rank / clientByServer);
[595]142        }
143      }
[300]144    }
145
[512]146    /*!
147    In case of attached mode, the current context must be reset to context for client
148    \param [in] event Event sent to server
149    */
[300]150    void CContextClient::sendEvent(CEventClient& event)
151    {
[731]152      list<int> ranks = event.getRanks();
[2130]153      info(100)<<"Event "<<timeLine<<" of context "<<context_->getId()<<endl ;
[1377]154      if (CXios::checkEventSync)
155      {
[2189]156        int typeId, classId, typeId_in, classId_in;
157        long long timeLine_out;
158        long long timeLine_in( timeLine );
[1377]159        typeId_in=event.getTypeId() ;
160        classId_in=event.getClassId() ;
[1475]161//        MPI_Allreduce(&timeLine,&timeLine_out, 1, MPI_UINT64_T, MPI_SUM, intraComm) ; // MPI_UINT64_T standardized by MPI 3
[2189]162        MPI_Allreduce(&timeLine_in,&timeLine_out, 1, MPI_LONG_LONG_INT, MPI_SUM, intraComm) ; 
[1639]163        MPI_Allreduce(&typeId_in,&typeId, 1, MPI_INT, MPI_SUM, intraComm) ;
164        MPI_Allreduce(&classId_in,&classId, 1, MPI_INT, MPI_SUM, intraComm) ;
[1377]165        if (typeId/clientSize!=event.getTypeId() || classId/clientSize!=event.getClassId() || timeLine_out/clientSize!=timeLine)
166        {
167           ERROR("void CContextClient::sendEvent(CEventClient& event)",
[2229]168               << "Event are not coherent between client for timeline = "<<timeLine);
[1377]169        }
[2229]170       
171        vector<int> servers(serverSize,0) ;
172        auto ranks=event.getRanks() ;
173        for(auto& rank : ranks) servers[rank]=1 ;
174        MPI_Allreduce(MPI_IN_PLACE, servers.data(), serverSize,MPI_INT,MPI_SUM,intraComm) ;
175        ostringstream osstr ;
176        for(int i=0;i<serverSize;i++)  if (servers[i]==0) osstr<<i<<" , " ;
177        if (!osstr.str().empty())
178        {
179          ERROR("void CContextClient::sendEvent(CEventClient& event)",
180                 <<" Some servers will not receive the message for timeline = "<<timeLine<<endl
181                 <<"Servers are : "<<osstr.str()) ;
182        }
183
184
[1377]185      }
186
[595]187      if (!event.isEmpty())
[300]188      {
[731]189        list<int> sizes = event.getSizes();
[300]190
[1757]191         // We force the getBuffers call to be non-blocking on classical servers
[1054]192        list<CBufferOut*> buffList;
[1757]193        getBuffers(timeLine, ranks, sizes, buffList) ;
[509]194
[1757]195        event.send(timeLine, sizes, buffList);
196       
197        //for (auto itRank = ranks.begin(); itRank != ranks.end(); itRank++) buffers[*itRank]->infoBuffer() ;
[731]198
[1757]199        unlockBuffers(ranks) ;
[2130]200        info(100)<<"Event "<<timeLine<<" of context "<<context_->getId()<<"  sent"<<endl ;
[1757]201         
202        checkBuffers(ranks);
[300]203      }
[1761]204     
205      if (isAttachedModeEnabled()) // couldBuffer is always true in attached mode
206      {
[2130]207        while (checkBuffers(ranks)) context_->globalEventLoop() ;
208     
209        CXios::getDaemonsManager()->scheduleContext(hashId_) ;
210        while (CXios::getDaemonsManager()->isScheduledContext(hashId_)) context_->globalEventLoop() ;
[1761]211      }
212     
[1054]213      timeLine++;
214    }
215
216    /*!
[512]217    If client is also server (attached mode), after sending event, it should process right away
218    the incoming event.
219    \param [in] ranks list rank of server connected this client
220    */
[300]221    void CContextClient::waitEvent(list<int>& ranks)
222    {
[1761]223      while (checkBuffers(ranks))
224      {
[2130]225        context_->eventLoop() ;
[1761]226      }
227
228      MPI_Request req ;
229      MPI_Status status ;
230
231      MPI_Ibarrier(intraComm,&req) ;
232      int flag=false ;
233
234      do 
235      {
236        CXios::getDaemonsManager()->eventLoop() ;
237        MPI_Test(&req,&flag,&status) ;
238      } while (!flag) ;
239
240
241    }
242
243
244    void CContextClient::waitEvent_old(list<int>& ranks)
245    {
[595]246      parentServer->server->setPendingEvent();
247      while (checkBuffers(ranks))
[300]248      {
[595]249        parentServer->server->listen();
250        parentServer->server->checkPendingRequest();
[300]251      }
[386]252
[595]253      while (parentServer->server->hasPendingEvent())
[386]254      {
[595]255       parentServer->server->eventLoop();
[386]256      }
[300]257    }
258
[512]259    /*!
[1054]260     * Get buffers for each connection to the servers. This function blocks until there is enough room in the buffers unless
261     * it is explicitly requested to be non-blocking.
262     *
[1757]263     *
264     * \param [in] timeLine time line of the event which will be sent to servers
[1054]265     * \param [in] serverList list of rank of connected server
266     * \param [in] sizeList size of message corresponding to each connection
267     * \param [out] retBuffers list of buffers that can be used to store an event
268     * \param [in] nonBlocking whether this function should be non-blocking
269     * \return whether the already allocated buffers could be used
[512]270    */
[1757]271    bool CContextClient::getBuffers(const size_t timeLine, const list<int>& serverList, const list<int>& sizeList, list<CBufferOut*>& retBuffers,
[1071]272                                    bool nonBlocking /*= false*/)
[300]273    {
[1054]274      list<int>::const_iterator itServer, itSize;
[595]275      list<CClientBuffer*> bufferList;
[1054]276      map<int,CClientBuffer*>::const_iterator it;
[595]277      list<CClientBuffer*>::iterator itBuffer;
[884]278      bool areBuffersFree;
[300]279
[595]280      for (itServer = serverList.begin(); itServer != serverList.end(); itServer++)
[300]281      {
[595]282        it = buffers.find(*itServer);
283        if (it == buffers.end())
[300]284        {
[595]285          newBuffer(*itServer);
286          it = buffers.find(*itServer);
[509]287        }
[595]288        bufferList.push_back(it->second);
[300]289      }
[347]290
[2246]291      double lastTimeBuffersNotFree=0. ;
292      double time ;
293      bool doUnlockBuffers ;
[347]294      CTimer::get("Blocking time").resume();
[884]295      do
[300]296      {
[884]297        areBuffersFree = true;
[2246]298        doUnlockBuffers=false ;
299        time=MPI_Wtime() ;
300        if (time-lastTimeBuffersNotFree > latency_)
[1757]301        {
[2246]302          for (itBuffer = bufferList.begin(), itSize = sizeList.begin(); itBuffer != bufferList.end(); itBuffer++, itSize++)
303          {
304            areBuffersFree &= (*itBuffer)->isBufferFree(*itSize);
305          }
306          if (!areBuffersFree)
307          {
308            lastTimeBuffersNotFree = time ;
309            doUnlockBuffers=true ;
310          }         
[1757]311        }
[2246]312        else areBuffersFree = false ;
[884]313
314        if (!areBuffersFree)
[300]315        {
[2246]316          if (doUnlockBuffers) for (itBuffer = bufferList.begin(); itBuffer != bufferList.end(); itBuffer++) (*itBuffer)->unlockBuffer();
[884]317          checkBuffers();
[1761]318
[2246]319          context_->globalEventLoop() ;
[2130]320        }
[1130]321
[1054]322      } while (!areBuffersFree && !nonBlocking);
[347]323      CTimer::get("Blocking time").suspend();
324
[1054]325      if (areBuffersFree)
[300]326      {
[1054]327        for (itBuffer = bufferList.begin(), itSize = sizeList.begin(); itBuffer != bufferList.end(); itBuffer++, itSize++)
[1757]328          retBuffers.push_back((*itBuffer)->getBuffer(timeLine, *itSize));
[300]329      }
[1054]330      return areBuffersFree;
[300]331   }
[509]332
[512]333   /*!
334   Make a new buffer for a certain connection to server with specific rank
335   \param [in] rank rank of connected server
336   */
[300]337   void CContextClient::newBuffer(int rank)
338   {
[1201]339      if (!mapBufferSize_.count(rank))
340      {
341        error(0) << "WARNING: Unexpected request for buffer to communicate with server " << rank << std::endl;
342        mapBufferSize_[rank] = CXios::minBufferSize;
343        maxEventSizes[rank] = CXios::minBufferSize;
344      }
[1757]345     
346      vector<MPI_Win> Wins(2,MPI_WIN_NULL) ;
347      if (!isAttachedModeEnabled()) Wins=windows[rank] ;
348 
349      CClientBuffer* buffer = buffers[rank] = new CClientBuffer(interComm, Wins, clientRank, rank, mapBufferSize_[rank], maxEventSizes[rank]);
[2130]350      if (isGrowableBuffer_) buffer->setGrowableBuffer(1.2) ;
351      else buffer->fixBuffer() ;
[1201]352      // Notify the server
[2130]353      CBufferOut* bufOut = buffer->getBuffer(0, 4*sizeof(MPI_Aint));
354      MPI_Aint sendBuff[4] ;
355      sendBuff[0]=hashId_;
356      sendBuff[1]=mapBufferSize_[rank];
357      sendBuff[2]=buffers[rank]->getWinAddress(0); 
358      sendBuff[3]=buffers[rank]->getWinAddress(1); 
[1757]359      info(100)<<"CContextClient::newBuffer : rank "<<rank<<" winAdress[0] "<<buffers[rank]->getWinAddress(0)<<" winAdress[1] "<<buffers[rank]->getWinAddress(1)<<endl;
[2130]360      bufOut->put(sendBuff, 4); 
[1757]361      buffer->checkBuffer(true);
362
[509]363   }
[300]364
[512]365   /*!
366   Verify state of buffers. Buffer is under pending state if there is no message on it
367   \return state of buffers, pending(true), ready(false)
368   */
[300]369   bool CContextClient::checkBuffers(void)
370   {
[595]371      map<int,CClientBuffer*>::iterator itBuff;
372      bool pending = false;
[1130]373      for (itBuff = buffers.begin(); itBuff != buffers.end(); itBuff++)
[1757]374        pending |= itBuff->second->checkBuffer(!pureOneSided);
[595]375      return pending;
[509]376   }
[300]377
[512]378   //! Release all buffers
[1071]379   void CContextClient::releaseBuffers()
[300]380   {
[595]381      map<int,CClientBuffer*>::iterator itBuff;
[1077]382      for (itBuff = buffers.begin(); itBuff != buffers.end(); itBuff++)
[1139]383      {
[1757]384         delete itBuff->second;
[1139]385      }
[1077]386      buffers.clear();
[1757]387
[2240]388// don't know when release windows
[1757]389
390      if (!isAttachedModeEnabled())
391      { 
392        for(int rank=0; rank<serverSize; rank++)
393        {
394          MPI_Win_free(&windows[rank][0]);
395          MPI_Win_free(&windows[rank][1]);
396        }
397      } 
[509]398   }
[1761]399
[1757]400     
401  /*!
402   Lock the buffers for one sided communications
403   \param [in] ranks list rank of server to which client connects to
404   */
405   void CContextClient::lockBuffers(list<int>& ranks)
406   {
407      list<int>::iterator it;
408      for (it = ranks.begin(); it != ranks.end(); it++) buffers[*it]->lockBuffer();
409   }
[300]410
[1757]411  /*!
412   Unlock the buffers for one sided communications
413   \param [in] ranks list rank of server to which client connects to
414   */
415   void CContextClient::unlockBuffers(list<int>& ranks)
416   {
417      list<int>::iterator it;
418      for (it = ranks.begin(); it != ranks.end(); it++) buffers[*it]->unlockBuffer();
419   }
420     
[512]421   /*!
422   Verify state of buffers corresponding to a connection
423   \param [in] ranks list rank of server to which client connects to
424   \return state of buffers, pending(true), ready(false)
425   */
[300]426   bool CContextClient::checkBuffers(list<int>& ranks)
427   {
[595]428      list<int>::iterator it;
429      bool pending = false;
[1757]430      for (it = ranks.begin(); it != ranks.end(); it++) pending |= buffers[*it]->checkBuffer(!pureOneSided);
[595]431      return pending;
[509]432   }
[300]433
[512]434   /*!
[917]435    * Set the buffer size for each connection. Warning: This function is collective.
436    *
437    * \param [in] mapSize maps the rank of the connected servers to the size of the correspoinding buffer
438    * \param [in] maxEventSize maps the rank of the connected servers to the size of the biggest event
[512]439   */
[2130]440   void CContextClient::setBufferSize(const std::map<int,StdSize>& mapSize)
[509]441   {
[2176]442     for(auto& it : mapSize) 
443      buffers[it.first]->fixBufferSize(std::max(CXios::minBufferSize*1.0,std::min(it.second*CXios::bufferSizeFactor*1.01,CXios::maxBufferSize*1.0)));
[509]444   }
445
[1158]446  /*!
447  Get leading server in the group of connected server
448  \return ranks of leading servers
449  */
450  const std::list<int>& CContextClient::getRanksServerNotLeader(void) const
451  {
452    return ranksServerNotLeader;
453  }
[1021]454
[1158]455  /*!
456  Check if client connects to leading server
457  \return connected(true), not connected (false)
458  */
459  bool CContextClient::isServerNotLeader(void) const
460  {
461    return !ranksServerNotLeader.empty();
462  }
[1021]463
[595]464  /*!
465  Get leading server in the group of connected server
466  \return ranks of leading servers
467  */
468  const std::list<int>& CContextClient::getRanksServerLeader(void) const
469  {
470    return ranksServerLeader;
471  }
[509]472
[595]473  /*!
474  Check if client connects to leading server
475  \return connected(true), not connected (false)
476  */
477  bool CContextClient::isServerLeader(void) const
478  {
479    return !ranksServerLeader.empty();
480  }
[300]481
[512]482   /*!
[1130]483   * Finalize context client and do some reports. Function is non-blocking.
[512]484   */
[1130]485  void CContextClient::finalize(void)
[1054]486  {
487    map<int,CClientBuffer*>::iterator itBuff;
[1757]488    std::list<int>::iterator ItServerLeader; 
489   
[1054]490    bool stop = false;
[731]491
[1757]492    int* nbServerConnectionLocal  = new int[serverSize] ;
493    int* nbServerConnectionGlobal  = new int[serverSize] ;
494    for(int i=0;i<serverSize;++i) nbServerConnectionLocal[i]=0 ;
495    for (itBuff = buffers.begin(); itBuff != buffers.end(); itBuff++)  nbServerConnectionLocal[itBuff->first]=1 ;
496    for (ItServerLeader = ranksServerLeader.begin(); ItServerLeader != ranksServerLeader.end(); ItServerLeader++)  nbServerConnectionLocal[*ItServerLeader]=1 ;
497   
498    MPI_Allreduce(nbServerConnectionLocal, nbServerConnectionGlobal, serverSize, MPI_INT, MPI_SUM, intraComm);
499   
500    CEventClient event(CContext::GetType(), CContext::EVENT_ID_CONTEXT_FINALIZE);
501    CMessage msg;
[509]502
[1757]503    for (int i=0;i<serverSize;++i) if (nbServerConnectionLocal[i]==1) event.push(i, nbServerConnectionGlobal[i], msg) ;
504    sendEvent(event);
505
506    delete[] nbServerConnectionLocal ;
507    delete[] nbServerConnectionGlobal ;
[509]508
[1765]509
[1054]510    CTimer::get("Blocking time").resume();
[1757]511    checkBuffers();
[1054]512    CTimer::get("Blocking time").suspend();
513
514    std::map<int,StdSize>::const_iterator itbMap = mapBufferSize_.begin(),
515                                          iteMap = mapBufferSize_.end(), itMap;
[1071]516
[1054]517    StdSize totalBuf = 0;
518    for (itMap = itbMap; itMap != iteMap; ++itMap)
519    {
[2130]520      report(10) << " Memory report : Context <" << context_->getId() << "> : client side : memory used for buffer of each connection to server" << endl
[1054]521                 << "  +) To server with rank " << itMap->first << " : " << itMap->second << " bytes " << endl;
522      totalBuf += itMap->second;
523    }
[2130]524    report(0) << " Memory report : Context <" << context_->getId() << "> : client side : total memory used for buffer " << totalBuf << " bytes" << endl;
[1054]525
526  }
[1130]527
[1139]528
529  /*!
530  */
[1130]531  bool CContextClient::havePendingRequests(void)
532  {
533    bool pending = false;
534    map<int,CClientBuffer*>::iterator itBuff;
535    for (itBuff = buffers.begin(); itBuff != buffers.end(); itBuff++)
536      pending |= itBuff->second->hasPendingRequest();
537    return pending;
538  }
[1757]539 
540  bool CContextClient::isNotifiedFinalized(void)
541  {
542    if (isAttachedModeEnabled()) return true ;
[1130]543
[1757]544    bool finalized = true;
545    map<int,CClientBuffer*>::iterator itBuff;
546    for (itBuff = buffers.begin(); itBuff != buffers.end(); itBuff++)
547      finalized &= itBuff->second->isNotifiedFinalized();
548    return finalized;
549  }
[1130]550
[509]551}
Note: See TracBrowser for help on using the repository browser.