source: XIOS/dev/dev_ym/XIOS_COUPLING/src/context_client.cpp @ 2240

Last change on this file since 2240 was 2240, checked in by jderouillat, 3 years ago

Free the MPI windows explicitly. Otherwise MPI will try to free them itself without respecting the required order, which leads to a deadlock.

  • Property copyright set to
    Software name : XIOS (Xml I/O Server)
    http://forge.ipsl.jussieu.fr/ioserver
    Creation date : January 2009
    Licence : CeCILL version 2
    see license file in root directory : Licence_CeCILL_V2-en.txt
    or http://www.cecill.info/licences/Licence_CeCILL_V2-en.html
    Holder : CEA/LSCE (Laboratoire des Sciences du Climat et de l'Environnement)
    CNRS/IPSL (Institut Pierre Simon Laplace)
    Project Manager : Yann Meurdesoif
    yann.meurdesoif@cea.fr
  • Property svn:eol-style set to native
File size: 18.1 KB
RevLine 
[591]1#include "xios_spl.hpp"
[300]2#include "context_client.hpp"
3#include "context_server.hpp"
4#include "event_client.hpp"
5#include "buffer_out.hpp"
6#include "buffer_client.hpp"
7#include "type.hpp"
8#include "event_client.hpp"
9#include "context.hpp"
[382]10#include "mpi.hpp"
[347]11#include "timer.hpp"
[401]12#include "cxios.hpp"
[1130]13#include "server.hpp"
[2130]14#include "services.hpp"
15#include <boost/functional/hash.hpp>
16#include <random>
17#include <chrono>
[300]18
[335]19namespace xios
[300]20{
[512]21    /*!
22    \param [in] parent Pointer to context on client side
23    \param [in] intraComm_ communicator of group client
24    \param [in] interComm_ communicator of group server
[983]25    \cxtSer [in] cxtSer Pointer to context of server side. (It is only used in case of attached mode).
[512]26    */
[1639]27    CContextClient::CContextClient(CContext* parent, MPI_Comm intraComm_, MPI_Comm interComm_, CContext* cxtSer)
[1853]28     : mapBufferSize_(), parentServer(cxtSer), maxBufferedEvents(4), associatedServer_(nullptr)
[300]29    {
[1757]30     
[2130]31      context_ = parent;
[595]32      intraComm = intraComm_;
33      interComm = interComm_;
[1639]34      MPI_Comm_rank(intraComm, &clientRank);
35      MPI_Comm_size(intraComm, &clientSize);
[509]36
[595]37      int flag;
[1639]38      MPI_Comm_test_inter(interComm, &flag);
[1761]39      if (flag) isAttached_=false ;
40      else  isAttached_=true ;
41
42      pureOneSided=CXios::getin<bool>("pure_one_sided",false); // pure one sided communication (for test)
43      if (isAttachedModeEnabled()) pureOneSided=false ; // no one sided in attach mode
44     
45
46
[1639]47      if (flag) MPI_Comm_remote_size(interComm, &serverSize);
48      else  MPI_Comm_size(interComm, &serverSize);
[509]49
[1232]50      computeLeader(clientRank, clientSize, serverSize, ranksServerLeader, ranksServerNotLeader);
51
[2229]52      if (flag) 
53      {
54        MPI_Intercomm_merge(interComm_,false, &interCommMerged) ;
55        int interCommMergedRank;
56        MPI_Comm_rank(interComm_, &interCommMergedRank);
57        MPI_Comm_rank(interCommMerged, &interCommMergedRank);
58        MPI_Comm_rank(intraComm, &interCommMergedRank);
59      }
[1757]60     
61      if (!isAttachedModeEnabled())
62      { 
63        windows.resize(serverSize) ;
64        MPI_Comm winComm ;
65        for(int rank=0; rank<serverSize; rank++)
66        {
67          windows[rank].resize(2) ;
68          MPI_Comm_split(interCommMerged, rank, clientRank, &winComm);
69          int myRank ;
70          MPI_Comm_rank(winComm,&myRank);
71          MPI_Win_create_dynamic(MPI_INFO_NULL, winComm, &windows[rank][0]);
72          MPI_Win_create_dynamic(MPI_INFO_NULL, winComm, &windows[rank][1]);
[2222]73//       ym : Warning : intelMPI doesn't support that communicator of windows be deallocated before the windows deallocation, crash at MPI_Win_lock
74//            Bug or not ?         
75//        MPI_Comm_free(&winComm) ;
[1757]76        }
77      }
78
79      MPI_Comm_split(intraComm_,clientRank,clientRank, &commSelf) ;
80
[2130]81      auto time=chrono::system_clock::now().time_since_epoch().count() ;
82      std::default_random_engine rd(time); // not reproducible from a run to another
83      std::uniform_int_distribution<size_t> dist;
84      hashId_=dist(rd) ;
85      MPI_Bcast(&hashId_,1,MPI_SIZE_T,0,intraComm) ; // Bcast to all server of the context
86
[1757]87      timeLine = 1;
[1232]88    }
89
90    void CContextClient::computeLeader(int clientRank, int clientSize, int serverSize,
91                                       std::list<int>& rankRecvLeader,
92                                       std::list<int>& rankRecvNotLeader)
93    {
94      if ((0 == clientSize) || (0 == serverSize)) return;
95
[595]96      if (clientSize < serverSize)
97      {
98        int serverByClient = serverSize / clientSize;
99        int remain = serverSize % clientSize;
100        int rankStart = serverByClient * clientRank;
[300]101
[595]102        if (clientRank < remain)
103        {
104          serverByClient++;
105          rankStart += clientRank;
106        }
107        else
108          rankStart += remain;
109
110        for (int i = 0; i < serverByClient; i++)
[1232]111          rankRecvLeader.push_back(rankStart + i);
[1021]112
[1232]113        rankRecvNotLeader.resize(0);
[1158]114      }
[595]115      else
116      {
117        int clientByServer = clientSize / serverSize;
118        int remain = clientSize % serverSize;
119
120        if (clientRank < (clientByServer + 1) * remain)
121        {
122          if (clientRank % (clientByServer + 1) == 0)
[1232]123            rankRecvLeader.push_back(clientRank / (clientByServer + 1));
[1021]124          else
[1232]125            rankRecvNotLeader.push_back(clientRank / (clientByServer + 1));
[595]126        }
127        else
128        {
129          int rank = clientRank - (clientByServer + 1) * remain;
130          if (rank % clientByServer == 0)
[1232]131            rankRecvLeader.push_back(remain + rank / clientByServer);
[1021]132          else
[1232]133            rankRecvNotLeader.push_back(remain + rank / clientByServer);
[595]134        }
135      }
[300]136    }
137
[512]138    /*!
139    In case of attached mode, the current context must be reset to context for client
140    \param [in] event Event sent to server
141    */
[300]142    void CContextClient::sendEvent(CEventClient& event)
143    {
[731]144      list<int> ranks = event.getRanks();
[2130]145      info(100)<<"Event "<<timeLine<<" of context "<<context_->getId()<<endl ;
[1377]146      if (CXios::checkEventSync)
147      {
[2189]148        int typeId, classId, typeId_in, classId_in;
149        long long timeLine_out;
150        long long timeLine_in( timeLine );
[1377]151        typeId_in=event.getTypeId() ;
152        classId_in=event.getClassId() ;
[1475]153//        MPI_Allreduce(&timeLine,&timeLine_out, 1, MPI_UINT64_T, MPI_SUM, intraComm) ; // MPI_UINT64_T standardized by MPI 3
[2189]154        MPI_Allreduce(&timeLine_in,&timeLine_out, 1, MPI_LONG_LONG_INT, MPI_SUM, intraComm) ; 
[1639]155        MPI_Allreduce(&typeId_in,&typeId, 1, MPI_INT, MPI_SUM, intraComm) ;
156        MPI_Allreduce(&classId_in,&classId, 1, MPI_INT, MPI_SUM, intraComm) ;
[1377]157        if (typeId/clientSize!=event.getTypeId() || classId/clientSize!=event.getClassId() || timeLine_out/clientSize!=timeLine)
158        {
159           ERROR("void CContextClient::sendEvent(CEventClient& event)",
[2229]160               << "Event are not coherent between client for timeline = "<<timeLine);
[1377]161        }
[2229]162       
163        vector<int> servers(serverSize,0) ;
164        auto ranks=event.getRanks() ;
165        for(auto& rank : ranks) servers[rank]=1 ;
166        MPI_Allreduce(MPI_IN_PLACE, servers.data(), serverSize,MPI_INT,MPI_SUM,intraComm) ;
167        ostringstream osstr ;
168        for(int i=0;i<serverSize;i++)  if (servers[i]==0) osstr<<i<<" , " ;
169        if (!osstr.str().empty())
170        {
171          ERROR("void CContextClient::sendEvent(CEventClient& event)",
172                 <<" Some servers will not receive the message for timeline = "<<timeLine<<endl
173                 <<"Servers are : "<<osstr.str()) ;
174        }
175
176
[1377]177      }
178
[595]179      if (!event.isEmpty())
[300]180      {
[731]181        list<int> sizes = event.getSizes();
[300]182
[1757]183         // We force the getBuffers call to be non-blocking on classical servers
[1054]184        list<CBufferOut*> buffList;
[1757]185        getBuffers(timeLine, ranks, sizes, buffList) ;
[509]186
[1757]187        event.send(timeLine, sizes, buffList);
188       
189        //for (auto itRank = ranks.begin(); itRank != ranks.end(); itRank++) buffers[*itRank]->infoBuffer() ;
[731]190
[1757]191        unlockBuffers(ranks) ;
[2130]192        info(100)<<"Event "<<timeLine<<" of context "<<context_->getId()<<"  sent"<<endl ;
[1757]193         
194        checkBuffers(ranks);
[300]195      }
[1761]196     
197      if (isAttachedModeEnabled()) // couldBuffer is always true in attached mode
198      {
[2130]199        while (checkBuffers(ranks)) context_->globalEventLoop() ;
200     
201        CXios::getDaemonsManager()->scheduleContext(hashId_) ;
202        while (CXios::getDaemonsManager()->isScheduledContext(hashId_)) context_->globalEventLoop() ;
[1761]203      }
204     
[1054]205      timeLine++;
206    }
207
208    /*!
[512]209    If client is also server (attached mode), after sending event, it should process right away
210    the incoming event.
211    \param [in] ranks list rank of server connected this client
212    */
[300]213    void CContextClient::waitEvent(list<int>& ranks)
214    {
[1761]215      while (checkBuffers(ranks))
216      {
[2130]217        context_->eventLoop() ;
[1761]218      }
219
220      MPI_Request req ;
221      MPI_Status status ;
222
223      MPI_Ibarrier(intraComm,&req) ;
224      int flag=false ;
225
226      do 
227      {
228        CXios::getDaemonsManager()->eventLoop() ;
229        MPI_Test(&req,&flag,&status) ;
230      } while (!flag) ;
231
232
233    }
234
235
236    void CContextClient::waitEvent_old(list<int>& ranks)
237    {
[595]238      parentServer->server->setPendingEvent();
239      while (checkBuffers(ranks))
[300]240      {
[595]241        parentServer->server->listen();
242        parentServer->server->checkPendingRequest();
[300]243      }
[386]244
[595]245      while (parentServer->server->hasPendingEvent())
[386]246      {
[595]247       parentServer->server->eventLoop();
[386]248      }
[300]249    }
250
[512]251    /*!
[1054]252     * Get buffers for each connection to the servers. This function blocks until there is enough room in the buffers unless
253     * it is explicitly requested to be non-blocking.
254     *
[1757]255     *
256     * \param [in] timeLine time line of the event which will be sent to servers
[1054]257     * \param [in] serverList list of rank of connected server
258     * \param [in] sizeList size of message corresponding to each connection
259     * \param [out] retBuffers list of buffers that can be used to store an event
260     * \param [in] nonBlocking whether this function should be non-blocking
261     * \return whether the already allocated buffers could be used
[512]262    */
[1757]263    bool CContextClient::getBuffers(const size_t timeLine, const list<int>& serverList, const list<int>& sizeList, list<CBufferOut*>& retBuffers,
[1071]264                                    bool nonBlocking /*= false*/)
[300]265    {
[1054]266      list<int>::const_iterator itServer, itSize;
[595]267      list<CClientBuffer*> bufferList;
[1054]268      map<int,CClientBuffer*>::const_iterator it;
[595]269      list<CClientBuffer*>::iterator itBuffer;
[884]270      bool areBuffersFree;
[300]271
[595]272      for (itServer = serverList.begin(); itServer != serverList.end(); itServer++)
[300]273      {
[595]274        it = buffers.find(*itServer);
275        if (it == buffers.end())
[300]276        {
[595]277          newBuffer(*itServer);
278          it = buffers.find(*itServer);
[509]279        }
[595]280        bufferList.push_back(it->second);
[300]281      }
[347]282
283      CTimer::get("Blocking time").resume();
[884]284      do
[300]285      {
[884]286        areBuffersFree = true;
[595]287        for (itBuffer = bufferList.begin(), itSize = sizeList.begin(); itBuffer != bufferList.end(); itBuffer++, itSize++)
[1757]288        {
[884]289          areBuffersFree &= (*itBuffer)->isBufferFree(*itSize);
[1757]290        }
[884]291
292        if (!areBuffersFree)
[300]293        {
[1757]294          for (itBuffer = bufferList.begin(); itBuffer != bufferList.end(); itBuffer++) (*itBuffer)->unlockBuffer();
[884]295          checkBuffers();
[2130]296/*         
[1761]297          context->server->listen();
298
299          if (context->serverPrimServer.size()>0)
[1130]300          {
[1757]301            for (int i = 0; i < context->serverPrimServer.size(); ++i)  context->serverPrimServer[i]->listen();
[1764]302 //ym           CServer::contextEventLoop(false) ; // avoid dead-lock at finalize...
303            context->globalEventLoop() ;
[1130]304          }
[2130]305*/
306           context_->globalEventLoop() ;
307        }
[1130]308
[1054]309      } while (!areBuffersFree && !nonBlocking);
[347]310      CTimer::get("Blocking time").suspend();
311
[1054]312      if (areBuffersFree)
[300]313      {
[1054]314        for (itBuffer = bufferList.begin(), itSize = sizeList.begin(); itBuffer != bufferList.end(); itBuffer++, itSize++)
[1757]315          retBuffers.push_back((*itBuffer)->getBuffer(timeLine, *itSize));
[300]316      }
[1054]317      return areBuffersFree;
[300]318   }
[509]319
[512]320   /*!
321   Make a new buffer for a certain connection to server with specific rank
322   \param [in] rank rank of connected server
323   */
[300]324   void CContextClient::newBuffer(int rank)
325   {
[1201]326      if (!mapBufferSize_.count(rank))
327      {
328        error(0) << "WARNING: Unexpected request for buffer to communicate with server " << rank << std::endl;
329        mapBufferSize_[rank] = CXios::minBufferSize;
330        maxEventSizes[rank] = CXios::minBufferSize;
331      }
[1757]332     
333      vector<MPI_Win> Wins(2,MPI_WIN_NULL) ;
334      if (!isAttachedModeEnabled()) Wins=windows[rank] ;
335 
336      CClientBuffer* buffer = buffers[rank] = new CClientBuffer(interComm, Wins, clientRank, rank, mapBufferSize_[rank], maxEventSizes[rank]);
[2130]337      if (isGrowableBuffer_) buffer->setGrowableBuffer(1.2) ;
338      else buffer->fixBuffer() ;
[1201]339      // Notify the server
[2130]340      CBufferOut* bufOut = buffer->getBuffer(0, 4*sizeof(MPI_Aint));
341      MPI_Aint sendBuff[4] ;
342      sendBuff[0]=hashId_;
343      sendBuff[1]=mapBufferSize_[rank];
344      sendBuff[2]=buffers[rank]->getWinAddress(0); 
345      sendBuff[3]=buffers[rank]->getWinAddress(1); 
[1757]346      info(100)<<"CContextClient::newBuffer : rank "<<rank<<" winAdress[0] "<<buffers[rank]->getWinAddress(0)<<" winAdress[1] "<<buffers[rank]->getWinAddress(1)<<endl;
[2130]347      bufOut->put(sendBuff, 4); 
[1757]348      buffer->checkBuffer(true);
349
[509]350   }
[300]351
[512]352   /*!
353   Verify state of buffers. Buffer is under pending state if there is no message on it
354   \return state of buffers, pending(true), ready(false)
355   */
[300]356   bool CContextClient::checkBuffers(void)
357   {
[595]358      map<int,CClientBuffer*>::iterator itBuff;
359      bool pending = false;
[1130]360      for (itBuff = buffers.begin(); itBuff != buffers.end(); itBuff++)
[1757]361        pending |= itBuff->second->checkBuffer(!pureOneSided);
[595]362      return pending;
[509]363   }
[300]364
[512]365   //! Release all buffers
[1071]366   void CContextClient::releaseBuffers()
[300]367   {
[595]368      map<int,CClientBuffer*>::iterator itBuff;
[1077]369      for (itBuff = buffers.begin(); itBuff != buffers.end(); itBuff++)
[1139]370      {
[1757]371         delete itBuff->second;
[1139]372      }
[1077]373      buffers.clear();
[1757]374
[2240]375// don't know when release windows
[1757]376
377      if (!isAttachedModeEnabled())
378      { 
379        for(int rank=0; rank<serverSize; rank++)
380        {
381          MPI_Win_free(&windows[rank][0]);
382          MPI_Win_free(&windows[rank][1]);
383        }
384      } 
[2240]385
[509]386   }
[1761]387
[1757]388     
389  /*!
390   Lock the buffers for one sided communications
391   \param [in] ranks list rank of server to which client connects to
392   */
393   void CContextClient::lockBuffers(list<int>& ranks)
394   {
395      list<int>::iterator it;
396      for (it = ranks.begin(); it != ranks.end(); it++) buffers[*it]->lockBuffer();
397   }
[300]398
[1757]399  /*!
400   Unlock the buffers for one sided communications
401   \param [in] ranks list rank of server to which client connects to
402   */
403   void CContextClient::unlockBuffers(list<int>& ranks)
404   {
405      list<int>::iterator it;
406      for (it = ranks.begin(); it != ranks.end(); it++) buffers[*it]->unlockBuffer();
407   }
408     
[512]409   /*!
410   Verify state of buffers corresponding to a connection
411   \param [in] ranks list rank of server to which client connects to
412   \return state of buffers, pending(true), ready(false)
413   */
[300]414   bool CContextClient::checkBuffers(list<int>& ranks)
415   {
[595]416      list<int>::iterator it;
417      bool pending = false;
[1757]418      for (it = ranks.begin(); it != ranks.end(); it++) pending |= buffers[*it]->checkBuffer(!pureOneSided);
[595]419      return pending;
[509]420   }
[300]421
[512]422   /*!
[917]423    * Set the buffer size for each connection. Warning: This function is collective.
424    *
425    * \param [in] mapSize maps the rank of the connected servers to the size of the correspoinding buffer
426    * \param [in] maxEventSize maps the rank of the connected servers to the size of the biggest event
[512]427   */
[2130]428   void CContextClient::setBufferSize(const std::map<int,StdSize>& mapSize)
[509]429   {
[2176]430     for(auto& it : mapSize) 
431      buffers[it.first]->fixBufferSize(std::max(CXios::minBufferSize*1.0,std::min(it.second*CXios::bufferSizeFactor*1.01,CXios::maxBufferSize*1.0)));
[509]432   }
433
[1158]434  /*!
435  Get leading server in the group of connected server
436  \return ranks of leading servers
437  */
438  const std::list<int>& CContextClient::getRanksServerNotLeader(void) const
439  {
440    return ranksServerNotLeader;
441  }
[1021]442
[1158]443  /*!
444  Check if client connects to leading server
445  \return connected(true), not connected (false)
446  */
447  bool CContextClient::isServerNotLeader(void) const
448  {
449    return !ranksServerNotLeader.empty();
450  }
[1021]451
[595]452  /*!
453  Get leading server in the group of connected server
454  \return ranks of leading servers
455  */
456  const std::list<int>& CContextClient::getRanksServerLeader(void) const
457  {
458    return ranksServerLeader;
459  }
[509]460
[595]461  /*!
462  Check if client connects to leading server
463  \return connected(true), not connected (false)
464  */
465  bool CContextClient::isServerLeader(void) const
466  {
467    return !ranksServerLeader.empty();
468  }
[300]469
[512]470   /*!
[1130]471   * Finalize context client and do some reports. Function is non-blocking.
[512]472   */
[1130]473  void CContextClient::finalize(void)
[1054]474  {
475    map<int,CClientBuffer*>::iterator itBuff;
[1757]476    std::list<int>::iterator ItServerLeader; 
477   
[1054]478    bool stop = false;
[731]479
[1757]480    int* nbServerConnectionLocal  = new int[serverSize] ;
481    int* nbServerConnectionGlobal  = new int[serverSize] ;
482    for(int i=0;i<serverSize;++i) nbServerConnectionLocal[i]=0 ;
483    for (itBuff = buffers.begin(); itBuff != buffers.end(); itBuff++)  nbServerConnectionLocal[itBuff->first]=1 ;
484    for (ItServerLeader = ranksServerLeader.begin(); ItServerLeader != ranksServerLeader.end(); ItServerLeader++)  nbServerConnectionLocal[*ItServerLeader]=1 ;
485   
486    MPI_Allreduce(nbServerConnectionLocal, nbServerConnectionGlobal, serverSize, MPI_INT, MPI_SUM, intraComm);
487   
488    CEventClient event(CContext::GetType(), CContext::EVENT_ID_CONTEXT_FINALIZE);
489    CMessage msg;
[509]490
[1757]491    for (int i=0;i<serverSize;++i) if (nbServerConnectionLocal[i]==1) event.push(i, nbServerConnectionGlobal[i], msg) ;
492    sendEvent(event);
493
494    delete[] nbServerConnectionLocal ;
495    delete[] nbServerConnectionGlobal ;
[509]496
[1765]497
[1054]498    CTimer::get("Blocking time").resume();
[1757]499    checkBuffers();
[1054]500    CTimer::get("Blocking time").suspend();
501
502    std::map<int,StdSize>::const_iterator itbMap = mapBufferSize_.begin(),
503                                          iteMap = mapBufferSize_.end(), itMap;
[1071]504
[1054]505    StdSize totalBuf = 0;
506    for (itMap = itbMap; itMap != iteMap; ++itMap)
507    {
[2130]508      report(10) << " Memory report : Context <" << context_->getId() << "> : client side : memory used for buffer of each connection to server" << endl
[1054]509                 << "  +) To server with rank " << itMap->first << " : " << itMap->second << " bytes " << endl;
510      totalBuf += itMap->second;
511    }
[2130]512    report(0) << " Memory report : Context <" << context_->getId() << "> : client side : total memory used for buffer " << totalBuf << " bytes" << endl;
[1054]513
514  }
[1130]515
[1139]516
517  /*!
518  */
[1130]519  bool CContextClient::havePendingRequests(void)
520  {
521    bool pending = false;
522    map<int,CClientBuffer*>::iterator itBuff;
523    for (itBuff = buffers.begin(); itBuff != buffers.end(); itBuff++)
524      pending |= itBuff->second->hasPendingRequest();
525    return pending;
526  }
[1757]527 
528  bool CContextClient::isNotifiedFinalized(void)
529  {
530    if (isAttachedModeEnabled()) return true ;
[1130]531
[1757]532    bool finalized = true;
533    map<int,CClientBuffer*>::iterator itBuff;
534    for (itBuff = buffers.begin(); itBuff != buffers.end(); itBuff++)
535      finalized &= itBuff->second->isNotifiedFinalized();
536    return finalized;
537  }
[1130]538
[509]539}
Note: See TracBrowser for help on using the repository browser.