[492] | 1 | #ifndef __EVENT_SCHEDULER_HPP__ |
---|
| 2 | #define __EVENT_SCHEDULER_HPP__ |
---|
| 3 | |
---|
[591] | 4 | #include "xios_spl.hpp" |
---|
[492] | 5 | #include "mpi.hpp" |
---|
| 6 | |
---|
| 7 | namespace xios |
---|
| 8 | { |
---|
| 9 | |
---|
| 10 | //! Event scheduling class. An instance of this class is used to order the events coming from different contexts to avoid deadlock. |
---|
| 11 | /*! |
---|
| 12 | * Events are ordered within a same context using the timeLine id, so each server will process the same event. But between different |
---|
| 13 | * contexts, events are not scheduled and servers may choose to process different events, and a deadlock or MPI crash may occur if |
---|
| 14 | * collective MPI communications are involved in the events. |
---|
| 15 | * This class solves the problem by scheduling the events and choosing which event must be processed by each server to ensure correct |
---|
| 16 | * synchronisation. Information is sent by asynchronous MPI communication to the root process, which orders the different events |
---|
| 17 | * (First In First Out) and broadcasts the information to the other servers. To avoid too many incoming communications for the root |
---|
| 18 | * process, a hierarchical tree is used for communicating from a limited number of child processes to the parent. |
---|
| 19 | */ |
---|
| 20 | |
---|
| 21 | class CEventScheduler |
---|
| 22 | { |
---|
| 23 | public: |
---|
| 24 | //! Constructor |
---|
| 25 | /*! A new communicator is created by duplicating comm, and the communication tree hierarchy is built. |
---|
| 26 | * @param[in] comm : MPI communicator to duplicate for internal use |
---|
| 27 | */ |
---|
[1639] | 28 | CEventScheduler(const MPI_Comm& comm) ; |
---|
[2522] | 29 | CEventScheduler(const MPI_Comm& comm, size_t schedulerLevel) ; |
---|
[492] | 30 | |
---|
| 31 | //! Destructor |
---|
| 32 | ~CEventScheduler() ; |
---|
| 33 | |
---|
| 34 | |
---|
| 35 | |
---|
| 36 | //! Public interface for registering an event from the server |
---|
| 37 | /*! |
---|
| 38 | * @param[in] timeLine : Time line id of the event |
---|
| 39 | * @param[in] contextHashId : Hashed id of the context |
---|
| 40 | */ |
---|
| 41 | void registerEvent(const size_t timeLine, const size_t contextHashId) ; |
---|
[2522] | 42 | |
---|
| 43 | private: |
---|
| 44 | CEventScheduler* getBaseScheduler(void) { if (childScheduler_== nullptr) return this; else return childScheduler_->getBaseScheduler();} /*!< Follows the childScheduler_ links recursively and returns the scheduler that has no child (this one if childScheduler_ is null) */ |
---|
[492] | 45 | |
---|
[2522] | 46 | public: |
---|
[492] | 47 | //! Public interface to query whether the event defined by timeLine and contextHashId is scheduled next |
---|
| 48 | /*! |
---|
| 49 | * @param[in] timeLine : Time line id of the event |
---|
| 50 | * @param[in] contextHashId : Hashed id of the context |
---|
| 51 | * @return : boolean value, true if the event is scheduled next |
---|
| 52 | * The call is forwarded to `queryEvent_` of the scheduler returned by `getBaseScheduler()`. |
---|
| 53 | * NOTE(review): the original comment states that an event scheduled next is removed from the `eventStack` queue; a separate `popEvent()` also exists, so confirm against the implementation. |
---|
| 54 | */ |
---|
[2522] | 55 | bool queryEvent(const size_t timeLine, const size_t contextHashId) { return getBaseScheduler()->queryEvent_(timeLine, contextHashId); } |
---|
| 56 | bool queryEvent_(const size_t timeLine, const size_t contextHashId) ; |
---|
| 57 | void popEvent() { getBaseScheduler()->popEvent_() ; } /*!< Forwards to popEvent_() of the scheduler returned by getBaseScheduler() */ |
---|
| 58 | void popEvent_() { eventStack_.pop() ; } /*!< Removes the front element of eventStack_; no emptiness check is done here */ |
---|
| 59 | bool isRoot(void) { return parent_[0]==mpiRank_ ;} /*!< True when the level-0 parent rank (parent_[0]) equals this process rank */ |
---|
| 60 | void setParentScheduler(shared_ptr<CEventScheduler> parentScheduler) { parentScheduler_ = parentScheduler ;} /*!< Stores parentScheduler in parentScheduler_ (hasParentScheduler_ is not modified here) */ |
---|
| 61 | void setChildScheduler(shared_ptr<CEventScheduler> childScheduler) { childScheduler_ = childScheduler ;} /*!< Stores childScheduler in childScheduler_, which getBaseScheduler() follows */ |
---|
| 62 | void splitScheduler(const MPI_Comm& splittedComm, shared_ptr<CEventScheduler>& parent, shared_ptr<CEventScheduler>& child) ; |
---|
[492] | 63 | |
---|
| 64 | //! Public interface that hands control to the instance so it can check pending or incoming messages. |
---|
| 65 | /*! |
---|
| 66 | * Must be called periodically. Calls the private methods `checkParentRequest` and `checkChildRequest`. |
---|
| 67 | */ |
---|
[2522] | 68 | void checkEvent(void) { getBaseScheduler()->checkEvent_(); } |
---|
| 69 | void checkEvent_(void) ; |
---|
[492] | 70 | |
---|
| 71 | private: |
---|
[2522] | 72 | void initialize(const MPI_Comm& comm) ; |
---|
[2569] | 73 | void cleanSplitSchedulers(); |
---|
[2522] | 74 | |
---|
[492] | 75 | //! Send an event to the parent of level `lev+1` |
---|
| 76 | /*! |
---|
| 77 | * @param[in] timeLine : Time line id of the event |
---|
| 78 | * @param[in] contextHashId : Hashed id of the context |
---|
| 79 | * @param[in] lev : current level of the child in the hierarchy |
---|
| 80 | * The event is sent by an asynchronous MPI_Isend |
---|
| 81 | */ |
---|
[2522] | 82 | |
---|
| 83 | void registerEvent(const size_t timeLine, const size_t contextHashId, const size_t schedulerLevel) ; |
---|
| 84 | void registerEvent(const size_t timeLine, const size_t contextHashId, const size_t schedulerLevel, const size_t lev) ; |
---|
[492] | 85 | |
---|
| 86 | |
---|
| 87 | |
---|
| 88 | //! Child side. Checks for incoming messages and whether pending requests have completed |
---|
| 89 | /*! |
---|
| 90 | * - Check with `MPI_Test` whether pending requests sent to parents are complete. |
---|
| 91 | * - Probe for an incoming message from the parent using `MPI_Probe`. If one is found, post an asynchronous reception with `MPI_Irecv` |
---|
| 92 | * - Check with `MPI_Test` whether pending receive requests are complete. If yes : |
---|
| 93 | * + Broadcast the event to the children if this process is also a parent |
---|
| 94 | * + Otherwise : push the incoming event in the `eventStack` queue. |
---|
| 95 | */ |
---|
| 96 | void checkParentRequest(void) ; |
---|
| 97 | |
---|
| 98 | |
---|
| 99 | |
---|
| 100 | //! Parent side. Checks for incoming messages and whether pending requests have completed |
---|
| 101 | /*! |
---|
| 102 | * - Probe for incoming messages from children using `MPI_Probe`. If one is found, post an asynchronous reception with `MPI_Irecv`. |
---|
| 103 | * - Check pending received event requests from children using `MPI_Probe`. If an event is received, it is inserted in the |
---|
| 104 | * map `recvEvent` and its counter is increased by 1. If the number of requests received from children for this event equals the number |
---|
| 105 | * of children then : |
---|
| 106 | * + if the event level is 0, broadcast the event to the children. |
---|
| 107 | * + else send the event to the parent. |
---|
| 108 | * - Check pending event requests sent to children using `MPI_Test` and, if complete, release the corresponding buffer |
---|
| 109 | */ |
---|
| 110 | void checkChildRequest(void) ; |
---|
| 111 | |
---|
| 112 | |
---|
| 113 | |
---|
| 114 | //! Parent side. Broadcast a received event from the parent to the children. |
---|
| 115 | /*! |
---|
| 116 | * @param[in] timeLine : Time line id of the event |
---|
| 117 | * @param[in] contextHashId : Hashed id of the context |
---|
| 118 | * @param[in] lev : current level of the child in the hierarchy |
---|
| 119 | * Asynchronous MPI_Isend is used. |
---|
| 120 | */ |
---|
[2522] | 121 | void bcastEvent(const size_t timeLine, const size_t contextHashId, const size_t schedulerLevel ,const size_t lev) ; |
---|
[492] | 122 | |
---|
| 123 | |
---|
| 124 | |
---|
| 125 | |
---|
| 126 | //! Structure defining an event, composed of the timeLine, the context hashId, the scheduler level and the hierarchical level of the communication. |
---|
| 127 | struct SEvent |
---|
| 128 | { |
---|
| 129 | size_t timeLine ; /*!< Time line id of the event in the context */ |
---|
| 130 | size_t hashId ; /*!< hash id of the context */ |
---|
[2522] | 131 | size_t schedulerLevel ; /*!< hierarchical level of the scheduler */ |
---|
[492] | 132 | size_t level ; /*!<hierarchical level of the communication*/ |
---|
| 133 | |
---|
| 134 | //! Definition of the == operator : two events are equal when timeLine, hashId, level and schedulerLevel all match |
---|
| 135 | /*! |
---|
| 136 | @param[in] e : object to compare with |
---|
| 137 | @return : boolean result of the comparison |
---|
| 138 | */ |
---|
| 139 | bool operator==(const SEvent& e) const |
---|
| 140 | { |
---|
[2522] | 141 | if (timeLine == e.timeLine && hashId == e.hashId && level==e.level && schedulerLevel==e.schedulerLevel) return true ; |
---|
[492] | 142 | else return false ; |
---|
| 143 | } ; |
---|
| 144 | |
---|
| 145 | |
---|
| 146 | //! Definition of the < operator : lexicographic order on (timeLine, hashId, schedulerLevel, level), needed to use the object as a key in a map container |
---|
| 147 | /*! |
---|
| 148 | @param[in] e : object to compare with |
---|
| 149 | @return : boolean result of the comparison |
---|
| 150 | */ |
---|
| 151 | |
---|
| 152 | bool operator<(const SEvent& e) const |
---|
| 153 | { |
---|
| 154 | if (timeLine < e.timeLine) return true ; |
---|
| 155 | else if (timeLine == e.timeLine && hashId < e.hashId) return true ; |
---|
[2522] | 156 | else if (timeLine == e.timeLine && hashId == e.hashId && schedulerLevel<e.schedulerLevel) return true ; |
---|
| 157 | else if (timeLine == e.timeLine && hashId == e.hashId && schedulerLevel==e.schedulerLevel && level<e.level) return true ; |
---|
[492] | 158 | else return false ; |
---|
| 159 | } ; |
---|
| 160 | } ; |
---|
[2522] | 161 | |
---|
[492] | 162 | //! Pending request structure. It keeps the send or receive buffer of an asynchronous communication while the request is not complete. |
---|
| 163 | struct SPendingRequest |
---|
| 164 | { |
---|
[2522] | 165 | size_t buffer[4] ; /*!< communication buffer of 4 values : timeLine, hashId, level and presumably schedulerLevel (TODO confirm content and order in the implementation) */ |
---|
[1639] | 166 | MPI_Request request ; /*!< pending MPI request */ |
---|
[492] | 167 | } ; |
---|
| 168 | |
---|
[2518] | 169 | MPI_Comm communicator_ ; /*!< Internal MPI communicator */ |
---|
| 170 | int mpiRank_ ; /*!< Rank in the communicator */ |
---|
| 171 | int mpiSize_ ; /*!< Size of the communicator */ |
---|
[492] | 172 | |
---|
[2518] | 173 | queue< pair<size_t, size_t> > eventStack_ ; |
---|
| 174 | queue<SPendingRequest* > pendingSentParentRequest_ ; /*!< Pending requests sent to parent */ |
---|
| 175 | queue<SPendingRequest*> pendingRecvParentRequest_ ; /*!< Pending requests received from parent */ |
---|
| 176 | list<SPendingRequest* > pendingRecvChildRequest_ ; /*!< Pending requests received from child */ |
---|
| 177 | list<SPendingRequest*> pendingSentChildRequest_ ; /*!< Pending requests sent to child */ |
---|
| 178 | map< SEvent, int > recvEvent_ ; /*!< Events received from children. Maps each event to the current number of children that have already posted it */ |
---|
[492] | 179 | |
---|
| 180 | |
---|
[2518] | 181 | int level_ ; /*!< Number of hierarchical levels for communication */ |
---|
| 182 | vector<int> parent_ ; /*!< Parent rank for each level */ |
---|
| 183 | vector<vector<int> > child_ ; /*!< List of child ranks for each level */ |
---|
| 184 | vector<int> nbChild_ ; /*!< Number of children for each level */ |
---|
[2522] | 185 | |
---|
| 186 | shared_ptr<CEventScheduler> parentScheduler_ ; /*!< Assigned by setParentScheduler() */ |
---|
| 187 | shared_ptr<CEventScheduler> childScheduler_ ; /*!< Assigned by setChildScheduler(); followed by getBaseScheduler() */ |
---|
| 188 | bool hasParentScheduler_=false ; /*!< Defaults to false. NOTE(review): never written in this header, check the implementation */ |
---|
| 189 | size_t schedulerLevel_ ; /*!< No in-class initializer; presumably set from the constructor argument schedulerLevel (TODO confirm) */ |
---|
[492] | 190 | |
---|
| 191 | } ; |
---|
| 192 | } |
---|
| 193 | |
---|
| 194 | #endif |
---|