source: XIOS/dev/dev_ym/XIOS_COUPLING/src/context_client.cpp @ 2310

Last change on this file since 2310 was 2310, checked in by ymipsl, 2 years ago

Implement small garbage collector for unfreed MPI windows and communicator.

YM

  • Property copyright set to
    Software name : XIOS (Xml I/O Server)
    http://forge.ipsl.jussieu.fr/ioserver
    Creation date : January 2009
    Licence : CeCILL version 2
    see license file in root directory : Licence_CeCILL_V2-en.txt
    or http://www.cecill.info/licences/Licence_CeCILL_V2-en.html
    Holder : CEA/LSCE (Laboratoire des Sciences du Climat et de l'Environnement)
    CNRS/IPSL (Institut Pierre Simon Laplace)
    Project Manager : Yann Meurdesoif
    yann.meurdesoif@cea.fr
  • Property svn:eol-style set to native
File size: 18.6 KB
RevLine 
[591]1#include "xios_spl.hpp"
[300]2#include "context_client.hpp"
3#include "context_server.hpp"
4#include "event_client.hpp"
5#include "buffer_out.hpp"
6#include "buffer_client.hpp"
7#include "type.hpp"
8#include "event_client.hpp"
9#include "context.hpp"
[382]10#include "mpi.hpp"
[347]11#include "timer.hpp"
[401]12#include "cxios.hpp"
[1130]13#include "server.hpp"
[2130]14#include "services.hpp"
15#include <boost/functional/hash.hpp>
16#include <random>
17#include <chrono>
[300]18
[335]19namespace xios
[300]20{
[512]21    /*!
22    \param [in] parent Pointer to context on client side
23    \param [in] intraComm_ communicator of group client
24    \param [in] interComm_ communicator of group server
[983]25    \cxtSer [in] cxtSer Pointer to context of server side. (It is only used in case of attached mode).
[512]26    */
[1639]27    CContextClient::CContextClient(CContext* parent, MPI_Comm intraComm_, MPI_Comm interComm_, CContext* cxtSer)
[1853]28     : mapBufferSize_(), parentServer(cxtSer), maxBufferedEvents(4), associatedServer_(nullptr)
[300]29    {
[1757]30     
[2130]31      context_ = parent;
[595]32      intraComm = intraComm_;
33      interComm = interComm_;
[1639]34      MPI_Comm_rank(intraComm, &clientRank);
35      MPI_Comm_size(intraComm, &clientSize);
[509]36
[595]37      int flag;
[1639]38      MPI_Comm_test_inter(interComm, &flag);
[1761]39      if (flag) isAttached_=false ;
40      else  isAttached_=true ;
41
42      pureOneSided=CXios::getin<bool>("pure_one_sided",false); // pure one sided communication (for test)
43      if (isAttachedModeEnabled()) pureOneSided=false ; // no one sided in attach mode
44     
45
46
[1639]47      if (flag) MPI_Comm_remote_size(interComm, &serverSize);
48      else  MPI_Comm_size(interComm, &serverSize);
[509]49
[1232]50      computeLeader(clientRank, clientSize, serverSize, ranksServerLeader, ranksServerNotLeader);
51
[2259]52      if (flag) MPI_Intercomm_merge(interComm_,false, &interCommMerged_) ;
[1757]53     
[2259]54      MPI_Comm_split(intraComm_,clientRank,clientRank, &commSelf_) ; // for windows
[2246]55
[2130]56      auto time=chrono::system_clock::now().time_since_epoch().count() ;
57      std::default_random_engine rd(time); // not reproducible from a run to another
58      std::uniform_int_distribution<size_t> dist;
59      hashId_=dist(rd) ;
60      MPI_Bcast(&hashId_,1,MPI_SIZE_T,0,intraComm) ; // Bcast to all server of the context
61
[1757]62      timeLine = 1;
[1232]63    }
64
65    void CContextClient::computeLeader(int clientRank, int clientSize, int serverSize,
66                                       std::list<int>& rankRecvLeader,
67                                       std::list<int>& rankRecvNotLeader)
68    {
69      if ((0 == clientSize) || (0 == serverSize)) return;
70
[595]71      if (clientSize < serverSize)
72      {
73        int serverByClient = serverSize / clientSize;
74        int remain = serverSize % clientSize;
75        int rankStart = serverByClient * clientRank;
[300]76
[595]77        if (clientRank < remain)
78        {
79          serverByClient++;
80          rankStart += clientRank;
81        }
82        else
83          rankStart += remain;
84
85        for (int i = 0; i < serverByClient; i++)
[1232]86          rankRecvLeader.push_back(rankStart + i);
[1021]87
[1232]88        rankRecvNotLeader.resize(0);
[1158]89      }
[595]90      else
91      {
92        int clientByServer = clientSize / serverSize;
93        int remain = clientSize % serverSize;
94
95        if (clientRank < (clientByServer + 1) * remain)
96        {
97          if (clientRank % (clientByServer + 1) == 0)
[1232]98            rankRecvLeader.push_back(clientRank / (clientByServer + 1));
[1021]99          else
[1232]100            rankRecvNotLeader.push_back(clientRank / (clientByServer + 1));
[595]101        }
102        else
103        {
104          int rank = clientRank - (clientByServer + 1) * remain;
105          if (rank % clientByServer == 0)
[1232]106            rankRecvLeader.push_back(remain + rank / clientByServer);
[1021]107          else
[1232]108            rankRecvNotLeader.push_back(remain + rank / clientByServer);
[595]109        }
110      }
[300]111    }
112
[512]113    /*!
114    In case of attached mode, the current context must be reset to context for client
115    \param [in] event Event sent to server
116    */
[300]117    void CContextClient::sendEvent(CEventClient& event)
118    {
[731]119      list<int> ranks = event.getRanks();
[2260]120 
121//      ostringstream str ;
122//      for(auto& rank : ranks) str<<rank<<" ; " ;
123//      info(100)<<"Event "<<timeLine<<" of context "<<context_->getId()<<"  for ranks : "<<str.str()<<endl ;
124
[1377]125      if (CXios::checkEventSync)
126      {
[2189]127        int typeId, classId, typeId_in, classId_in;
128        long long timeLine_out;
129        long long timeLine_in( timeLine );
[1377]130        typeId_in=event.getTypeId() ;
131        classId_in=event.getClassId() ;
[1475]132//        MPI_Allreduce(&timeLine,&timeLine_out, 1, MPI_UINT64_T, MPI_SUM, intraComm) ; // MPI_UINT64_T standardized by MPI 3
[2189]133        MPI_Allreduce(&timeLine_in,&timeLine_out, 1, MPI_LONG_LONG_INT, MPI_SUM, intraComm) ; 
[1639]134        MPI_Allreduce(&typeId_in,&typeId, 1, MPI_INT, MPI_SUM, intraComm) ;
135        MPI_Allreduce(&classId_in,&classId, 1, MPI_INT, MPI_SUM, intraComm) ;
[1377]136        if (typeId/clientSize!=event.getTypeId() || classId/clientSize!=event.getClassId() || timeLine_out/clientSize!=timeLine)
137        {
138           ERROR("void CContextClient::sendEvent(CEventClient& event)",
[2229]139               << "Event are not coherent between client for timeline = "<<timeLine);
[1377]140        }
[2229]141       
142        vector<int> servers(serverSize,0) ;
143        auto ranks=event.getRanks() ;
144        for(auto& rank : ranks) servers[rank]=1 ;
145        MPI_Allreduce(MPI_IN_PLACE, servers.data(), serverSize,MPI_INT,MPI_SUM,intraComm) ;
146        ostringstream osstr ;
147        for(int i=0;i<serverSize;i++)  if (servers[i]==0) osstr<<i<<" , " ;
148        if (!osstr.str().empty())
149        {
150          ERROR("void CContextClient::sendEvent(CEventClient& event)",
151                 <<" Some servers will not receive the message for timeline = "<<timeLine<<endl
152                 <<"Servers are : "<<osstr.str()) ;
153        }
154
155
[1377]156      }
157
[595]158      if (!event.isEmpty())
[300]159      {
[731]160        list<int> sizes = event.getSizes();
[300]161
[1757]162         // We force the getBuffers call to be non-blocking on classical servers
[1054]163        list<CBufferOut*> buffList;
[1757]164        getBuffers(timeLine, ranks, sizes, buffList) ;
[509]165
[1757]166        event.send(timeLine, sizes, buffList);
167       
168        //for (auto itRank = ranks.begin(); itRank != ranks.end(); itRank++) buffers[*itRank]->infoBuffer() ;
[731]169
[1757]170        unlockBuffers(ranks) ;
171        checkBuffers(ranks);
[2260]172       
[300]173      }
[1761]174     
175      if (isAttachedModeEnabled()) // couldBuffer is always true in attached mode
176      {
[2130]177        while (checkBuffers(ranks)) context_->globalEventLoop() ;
178     
179        CXios::getDaemonsManager()->scheduleContext(hashId_) ;
180        while (CXios::getDaemonsManager()->isScheduledContext(hashId_)) context_->globalEventLoop() ;
[1761]181      }
182     
[1054]183      timeLine++;
184    }
185
186    /*!
[512]187    If client is also server (attached mode), after sending event, it should process right away
188    the incoming event.
189    \param [in] ranks list rank of server connected this client
190    */
[300]191    void CContextClient::waitEvent(list<int>& ranks)
192    {
[1761]193      while (checkBuffers(ranks))
194      {
[2130]195        context_->eventLoop() ;
[1761]196      }
197
198      MPI_Request req ;
199      MPI_Status status ;
200
201      MPI_Ibarrier(intraComm,&req) ;
202      int flag=false ;
203
204      do 
205      {
206        CXios::getDaemonsManager()->eventLoop() ;
207        MPI_Test(&req,&flag,&status) ;
208      } while (!flag) ;
209
210
211    }
212
213
214    void CContextClient::waitEvent_old(list<int>& ranks)
215    {
[595]216      parentServer->server->setPendingEvent();
217      while (checkBuffers(ranks))
[300]218      {
[595]219        parentServer->server->listen();
220        parentServer->server->checkPendingRequest();
[300]221      }
[386]222
[595]223      while (parentServer->server->hasPendingEvent())
[386]224      {
[595]225       parentServer->server->eventLoop();
[386]226      }
[300]227    }
228
[512]229    /*!
[1054]230     * Get buffers for each connection to the servers. This function blocks until there is enough room in the buffers unless
231     * it is explicitly requested to be non-blocking.
232     *
[1757]233     *
234     * \param [in] timeLine time line of the event which will be sent to servers
[1054]235     * \param [in] serverList list of rank of connected server
236     * \param [in] sizeList size of message corresponding to each connection
237     * \param [out] retBuffers list of buffers that can be used to store an event
238     * \param [in] nonBlocking whether this function should be non-blocking
239     * \return whether the already allocated buffers could be used
[512]240    */
[1757]241    bool CContextClient::getBuffers(const size_t timeLine, const list<int>& serverList, const list<int>& sizeList, list<CBufferOut*>& retBuffers,
[1071]242                                    bool nonBlocking /*= false*/)
[300]243    {
[1054]244      list<int>::const_iterator itServer, itSize;
[595]245      list<CClientBuffer*> bufferList;
[1054]246      map<int,CClientBuffer*>::const_iterator it;
[595]247      list<CClientBuffer*>::iterator itBuffer;
[884]248      bool areBuffersFree;
[300]249
[595]250      for (itServer = serverList.begin(); itServer != serverList.end(); itServer++)
[300]251      {
[595]252        it = buffers.find(*itServer);
253        if (it == buffers.end())
[300]254        {
[595]255          newBuffer(*itServer);
256          it = buffers.find(*itServer);
[509]257        }
[595]258        bufferList.push_back(it->second);
[300]259      }
[347]260
[2246]261      double lastTimeBuffersNotFree=0. ;
262      double time ;
263      bool doUnlockBuffers ;
[347]264      CTimer::get("Blocking time").resume();
[884]265      do
[300]266      {
[884]267        areBuffersFree = true;
[2246]268        doUnlockBuffers=false ;
269        time=MPI_Wtime() ;
270        if (time-lastTimeBuffersNotFree > latency_)
[1757]271        {
[2246]272          for (itBuffer = bufferList.begin(), itSize = sizeList.begin(); itBuffer != bufferList.end(); itBuffer++, itSize++)
273          {
274            areBuffersFree &= (*itBuffer)->isBufferFree(*itSize);
275          }
276          if (!areBuffersFree)
277          {
278            lastTimeBuffersNotFree = time ;
279            doUnlockBuffers=true ;
280          }         
[1757]281        }
[2246]282        else areBuffersFree = false ;
[884]283
284        if (!areBuffersFree)
[300]285        {
[2246]286          if (doUnlockBuffers) for (itBuffer = bufferList.begin(); itBuffer != bufferList.end(); itBuffer++) (*itBuffer)->unlockBuffer();
[884]287          checkBuffers();
[1761]288
[2246]289          context_->globalEventLoop() ;
[2130]290        }
[1130]291
[1054]292      } while (!areBuffersFree && !nonBlocking);
[347]293      CTimer::get("Blocking time").suspend();
294
[1054]295      if (areBuffersFree)
[300]296      {
[1054]297        for (itBuffer = bufferList.begin(), itSize = sizeList.begin(); itBuffer != bufferList.end(); itBuffer++, itSize++)
[1757]298          retBuffers.push_back((*itBuffer)->getBuffer(timeLine, *itSize));
[300]299      }
[1054]300      return areBuffersFree;
[300]301   }
[509]302
[512]303   /*!
304   Make a new buffer for a certain connection to server with specific rank
305   \param [in] rank rank of connected server
306   */
[300]307   void CContextClient::newBuffer(int rank)
308   {
[1201]309      if (!mapBufferSize_.count(rank))
310      {
311        error(0) << "WARNING: Unexpected request for buffer to communicate with server " << rank << std::endl;
312        mapBufferSize_[rank] = CXios::minBufferSize;
313        maxEventSizes[rank] = CXios::minBufferSize;
314      }
[1757]315     
[2259]316      CClientBuffer* buffer = buffers[rank] = new CClientBuffer(interComm, rank, mapBufferSize_[rank], maxEventSizes[rank]);
[2130]317      if (isGrowableBuffer_) buffer->setGrowableBuffer(1.2) ;
318      else buffer->fixBuffer() ;
[1201]319      // Notify the server
[2130]320      CBufferOut* bufOut = buffer->getBuffer(0, 4*sizeof(MPI_Aint));
321      MPI_Aint sendBuff[4] ;
322      sendBuff[0]=hashId_;
323      sendBuff[1]=mapBufferSize_[rank];
324      sendBuff[2]=buffers[rank]->getWinAddress(0); 
325      sendBuff[3]=buffers[rank]->getWinAddress(1); 
[1757]326      info(100)<<"CContextClient::newBuffer : rank "<<rank<<" winAdress[0] "<<buffers[rank]->getWinAddress(0)<<" winAdress[1] "<<buffers[rank]->getWinAddress(1)<<endl;
[2130]327      bufOut->put(sendBuff, 4); 
[1757]328      buffer->checkBuffer(true);
[2259]329     
330       // create windows dynamically for one-sided
331      if (!isAttachedModeEnabled())
332      { 
333        CTimer::get("create Windows").resume() ;
334        MPI_Comm interComm ;
335        MPI_Intercomm_create(commSelf_, 0, interCommMerged_, clientSize+rank, 0, &interComm) ;
336        MPI_Intercomm_merge(interComm, false, &winComm_[rank]) ;
[2310]337        CXios::getMpiGarbageCollector().registerCommunicator(winComm_[rank]) ;
[2259]338        MPI_Comm_free(&interComm) ;
339        windows_[rank].resize(2) ;
[2310]340       
[2259]341        MPI_Win_create_dynamic(MPI_INFO_NULL, winComm_[rank], &windows_[rank][0]);
[2310]342        CXios::getMpiGarbageCollector().registerWindow(windows_[rank][0]) ;
343       
[2259]344        MPI_Win_create_dynamic(MPI_INFO_NULL, winComm_[rank], &windows_[rank][1]);   
[2310]345        CXios::getMpiGarbageCollector().registerWindow(windows_[rank][1]) ;
346
[2259]347        CTimer::get("create Windows").suspend() ;
348      }
349      else
350      {
351        winComm_[rank] = MPI_COMM_NULL ;
352        windows_[rank].resize(2) ;
353        windows_[rank][0] = MPI_WIN_NULL ;
354        windows_[rank][1] = MPI_WIN_NULL ;
355      }
356      buffer->attachWindows(windows_[rank]) ;
[2260]357      if (!isAttachedModeEnabled()) MPI_Barrier(winComm_[rank]) ;
358       
[509]359   }
[300]360
[512]361   /*!
362   Verify state of buffers. Buffer is under pending state if there is no message on it
363   \return state of buffers, pending(true), ready(false)
364   */
[300]365   bool CContextClient::checkBuffers(void)
366   {
[595]367      map<int,CClientBuffer*>::iterator itBuff;
368      bool pending = false;
[1130]369      for (itBuff = buffers.begin(); itBuff != buffers.end(); itBuff++)
[1757]370        pending |= itBuff->second->checkBuffer(!pureOneSided);
[595]371      return pending;
[509]372   }
[300]373
[512]374   //! Release all buffers
[1071]375   void CContextClient::releaseBuffers()
[300]376   {
[2276]377      map<int,CClientBuffer*>::iterator itBuff;
378      for (itBuff = buffers.begin(); itBuff != buffers.end(); itBuff++)
379      {
380         delete itBuff->second;
381      }
382      buffers.clear();
[1757]383
[2240]384// don't know when release windows
[1757]385
[2287]386      //if (!isAttachedModeEnabled())
387      //{ 
388      //  for(auto& it : winComm_)
389      //  {
390      //    int rank = it.first ;
391      //    MPI_Win_free(&windows_[rank][0]);
392      //    MPI_Win_free(&windows_[rank][1]);
393      //    MPI_Comm_free(&winComm_[rank]) ;
394      //  }
395      //}
[509]396   }
[1761]397
[1757]398     
399  /*!
400   Lock the buffers for one sided communications
401   \param [in] ranks list rank of server to which client connects to
402   */
403   void CContextClient::lockBuffers(list<int>& ranks)
404   {
405      list<int>::iterator it;
406      for (it = ranks.begin(); it != ranks.end(); it++) buffers[*it]->lockBuffer();
407   }
[300]408
[1757]409  /*!
410   Unlock the buffers for one sided communications
411   \param [in] ranks list rank of server to which client connects to
412   */
413   void CContextClient::unlockBuffers(list<int>& ranks)
414   {
415      list<int>::iterator it;
416      for (it = ranks.begin(); it != ranks.end(); it++) buffers[*it]->unlockBuffer();
417   }
418     
[512]419   /*!
420   Verify state of buffers corresponding to a connection
421   \param [in] ranks list rank of server to which client connects to
422   \return state of buffers, pending(true), ready(false)
423   */
[300]424   bool CContextClient::checkBuffers(list<int>& ranks)
425   {
[595]426      list<int>::iterator it;
427      bool pending = false;
[1757]428      for (it = ranks.begin(); it != ranks.end(); it++) pending |= buffers[*it]->checkBuffer(!pureOneSided);
[595]429      return pending;
[509]430   }
[300]431
[512]432   /*!
[917]433    * Set the buffer size for each connection. Warning: This function is collective.
434    *
435    * \param [in] mapSize maps the rank of the connected servers to the size of the correspoinding buffer
436    * \param [in] maxEventSize maps the rank of the connected servers to the size of the biggest event
[512]437   */
[2130]438   void CContextClient::setBufferSize(const std::map<int,StdSize>& mapSize)
[509]439   {
[2176]440     for(auto& it : mapSize) 
441      buffers[it.first]->fixBufferSize(std::max(CXios::minBufferSize*1.0,std::min(it.second*CXios::bufferSizeFactor*1.01,CXios::maxBufferSize*1.0)));
[509]442   }
443
[1158]444  /*!
445  Get leading server in the group of connected server
446  \return ranks of leading servers
447  */
448  const std::list<int>& CContextClient::getRanksServerNotLeader(void) const
449  {
450    return ranksServerNotLeader;
451  }
[1021]452
[1158]453  /*!
454  Check if client connects to leading server
455  \return connected(true), not connected (false)
456  */
457  bool CContextClient::isServerNotLeader(void) const
458  {
459    return !ranksServerNotLeader.empty();
460  }
[1021]461
[595]462  /*!
463  Get leading server in the group of connected server
464  \return ranks of leading servers
465  */
466  const std::list<int>& CContextClient::getRanksServerLeader(void) const
467  {
468    return ranksServerLeader;
469  }
[509]470
[595]471  /*!
472  Check if client connects to leading server
473  \return connected(true), not connected (false)
474  */
475  bool CContextClient::isServerLeader(void) const
476  {
477    return !ranksServerLeader.empty();
478  }
[300]479
[512]480   /*!
[1130]481   * Finalize context client and do some reports. Function is non-blocking.
[512]482   */
[1130]483  void CContextClient::finalize(void)
[1054]484  {
485    map<int,CClientBuffer*>::iterator itBuff;
[1757]486    std::list<int>::iterator ItServerLeader; 
487   
[1054]488    bool stop = false;
[731]489
[1757]490    int* nbServerConnectionLocal  = new int[serverSize] ;
491    int* nbServerConnectionGlobal  = new int[serverSize] ;
492    for(int i=0;i<serverSize;++i) nbServerConnectionLocal[i]=0 ;
493    for (itBuff = buffers.begin(); itBuff != buffers.end(); itBuff++)  nbServerConnectionLocal[itBuff->first]=1 ;
494    for (ItServerLeader = ranksServerLeader.begin(); ItServerLeader != ranksServerLeader.end(); ItServerLeader++)  nbServerConnectionLocal[*ItServerLeader]=1 ;
495   
496    MPI_Allreduce(nbServerConnectionLocal, nbServerConnectionGlobal, serverSize, MPI_INT, MPI_SUM, intraComm);
497   
498    CEventClient event(CContext::GetType(), CContext::EVENT_ID_CONTEXT_FINALIZE);
499    CMessage msg;
[509]500
[1757]501    for (int i=0;i<serverSize;++i) if (nbServerConnectionLocal[i]==1) event.push(i, nbServerConnectionGlobal[i], msg) ;
502    sendEvent(event);
503
504    delete[] nbServerConnectionLocal ;
505    delete[] nbServerConnectionGlobal ;
[509]506
[1765]507
[1054]508    CTimer::get("Blocking time").resume();
[1757]509    checkBuffers();
[1054]510    CTimer::get("Blocking time").suspend();
511
512    std::map<int,StdSize>::const_iterator itbMap = mapBufferSize_.begin(),
513                                          iteMap = mapBufferSize_.end(), itMap;
[1071]514
[1054]515    StdSize totalBuf = 0;
516    for (itMap = itbMap; itMap != iteMap; ++itMap)
517    {
[2130]518      report(10) << " Memory report : Context <" << context_->getId() << "> : client side : memory used for buffer of each connection to server" << endl
[1054]519                 << "  +) To server with rank " << itMap->first << " : " << itMap->second << " bytes " << endl;
520      totalBuf += itMap->second;
521    }
[2130]522    report(0) << " Memory report : Context <" << context_->getId() << "> : client side : total memory used for buffer " << totalBuf << " bytes" << endl;
[1054]523
524  }
[1130]525
[1139]526
527  /*!
528  */
[1130]529  bool CContextClient::havePendingRequests(void)
530  {
531    bool pending = false;
532    map<int,CClientBuffer*>::iterator itBuff;
533    for (itBuff = buffers.begin(); itBuff != buffers.end(); itBuff++)
534      pending |= itBuff->second->hasPendingRequest();
535    return pending;
536  }
[1757]537 
[2260]538  bool CContextClient::havePendingRequests(list<int>& ranks)
539  {
540      list<int>::iterator it;
541      bool pending = false;
542      for (it = ranks.begin(); it != ranks.end(); it++) pending |= buffers[*it]->hasPendingRequest();
543      return pending;
544  }
545
[1757]546  bool CContextClient::isNotifiedFinalized(void)
547  {
548    if (isAttachedModeEnabled()) return true ;
[1130]549
[1757]550    bool finalized = true;
551    map<int,CClientBuffer*>::iterator itBuff;
552    for (itBuff = buffers.begin(); itBuff != buffers.end(); itBuff++)
553      finalized &= itBuff->second->isNotifiedFinalized();
554    return finalized;
555  }
[1130]556
[509]557}
Note: See TracBrowser for help on using the repository browser.