[492] | 1 | #ifndef __EVENT_SCHEDULER_HPP__ |
---|
| 2 | #define __EVENT_SCHEDULER_HPP__ |
---|
| 3 | |
---|
[591] | 4 | #include "xios_spl.hpp" |
---|
[492] | 5 | #include "mpi.hpp" |
---|
| 6 | |
---|
| 7 | namespace xios |
---|
| 8 | { |
---|
| 9 | |
---|
//! Event scheduling class. An instance of this class is used to order the events coming from different contexts in order to avoid deadlocks.
/*!
 *  Events are ordered within a same context using the timeLine id, so each server will process the same event. But between different
 *  contexts, events are not scheduled, so servers may choose to process different events, and a deadlock or an MPI crash may occur if
 *  collective MPI communications are involved in the events.
 *  This class solves the problem by scheduling the events and choosing which event must be processed by each server to ensure correct
 *  synchronisation. Information is sent by asynchronous MPI communication to the root process, which orders the different events
 *  (First In First Out) and broadcasts the information to the other servers. To avoid too many incoming communications for the root
 *  process, a hierarchical tree is used for communicating from a limited number of child processes to their parent.
 */
---|
| 20 | |
---|
| 21 | class CEventScheduler |
---|
| 22 | { |
---|
| 23 | public: |
---|
| 24 | //! Constructor |
---|
| 25 | /*! A new communicator is created by duplicate comm. The communicating tree hierarchy is created. |
---|
| 26 | * @param[in] comm : MPI communicator du duplicate for internal use |
---|
| 27 | */ |
---|
[1639] | 28 | CEventScheduler(const MPI_Comm& comm) ; |
---|
[492] | 29 | |
---|
| 30 | |
---|
| 31 | //! Destructor |
---|
| 32 | ~CEventScheduler() ; |
---|
| 33 | |
---|
| 34 | |
---|
| 35 | |
---|
| 36 | //! public interface for registring an event from the server |
---|
| 37 | /*! |
---|
| 38 | * @param[in] timeLine : Time line id of the event |
---|
| 39 | * @param[in] contextHashId : Hashed id of the context |
---|
| 40 | */ |
---|
| 41 | void registerEvent(const size_t timeLine, const size_t contextHashId) ; |
---|
| 42 | |
---|
| 43 | |
---|
| 44 | |
---|
| 45 | //! public interface for query if the event defined by timeLine and hashId is sheduled next |
---|
| 46 | /*! |
---|
| 47 | * @param[in] timeLine : Time line id of the event |
---|
| 48 | * @param[in] contextHasId : Hashed id of the context |
---|
| 49 | * @return : boolean value, true is the event is scheduled next |
---|
| 50 | * |
---|
| 51 | * If the event is scheduled next, it is remove from the `eventStack` queue list |
---|
| 52 | */ |
---|
| 53 | bool queryEvent(const size_t timeLine, const size_t contextHashId) ; |
---|
| 54 | |
---|
| 55 | |
---|
| 56 | //! Public interface to give the hand to the instance to check pending or incoming message. |
---|
| 57 | /*! |
---|
| 58 | * Must be called periodicaly. Call `checkParentRequest` and `checkChildRequest` private method. |
---|
| 59 | */ |
---|
| 60 | void checkEvent(void) ; |
---|
| 61 | |
---|
| 62 | private: |
---|
| 63 | |
---|
| 64 | |
---|
| 65 | //! Send an event to the parent of level `lev+1` |
---|
| 66 | /*! |
---|
| 67 | * @param[in] timeLine : Time line id of the event |
---|
| 68 | * @param[in] contextHasId : Hashed id of the context |
---|
| 69 | * @param[in] lev : actual level of the child in the hierarchy |
---|
| 70 | * The event is sent by an asynchrounous MPI_ISend |
---|
| 71 | */ |
---|
| 72 | void registerEvent(const size_t timeLine, const size_t contextHashId, const size_t lev) ; |
---|
| 73 | |
---|
| 74 | |
---|
| 75 | |
---|
| 76 | //! Children side. Check potential incoming message and if pending request are completed |
---|
| 77 | /*! |
---|
| 78 | * - Check by `MPI_Test` if pending request sent to parents are complete. |
---|
| 79 | * - Probe incoming message from parent by using `MPI_Probe`. If yes, post an asynchronous reception by `MPI_IRecv` |
---|
| 80 | * - Check by `MPI_Test` if pending received requests are complete. if yes : |
---|
| 81 | * + Broadcast the event to the childrens if is also a parent |
---|
| 82 | * + Otherwise : push the incomming event in the `eventStack` queue. |
---|
| 83 | */ |
---|
| 84 | void checkParentRequest(void) ; |
---|
| 85 | |
---|
| 86 | |
---|
| 87 | |
---|
| 88 | //! Parent side. Check potential incoming message and if pending request are completed |
---|
| 89 | /*! |
---|
| 90 | * - Probe incoming message from chidren by using `MPI_Probe`. If yes, post an asynchronous reception by `MPI_IRecv`. |
---|
| 91 | * - Check pending received event request from children using `MPI_Probe`. If and event is received, it is incerted in the |
---|
| 92 | * map `recvEvent` which is increased by 1. If the number of request received from children for this event is equal to the number |
---|
| 93 | * of children then : |
---|
| 94 | * + if the event level is 0, bcast the event to the children. |
---|
| 95 | * + else send the event to the parent. |
---|
| 96 | * - Check pending sent event request to children using `MPI_TEST` and if complete release the corresponding buffer |
---|
| 97 | */ |
---|
| 98 | void checkChildRequest(void) ; |
---|
| 99 | |
---|
| 100 | |
---|
| 101 | |
---|
| 102 | //! Parent side. Broadcast a received event from the parent to the children. |
---|
| 103 | /*! |
---|
| 104 | * @param[in] timeLine : Time line id of the event |
---|
| 105 | * @param[in] contextHasId : Hashed id of the context |
---|
| 106 | * @param[in] lev : actual level of the child in the hierarchy |
---|
| 107 | * Asynchronus MPI_ISend is used. |
---|
| 108 | */ |
---|
| 109 | void bcastEvent(const size_t timeLine, const size_t contextHashId, const size_t lev) ; |
---|
| 110 | |
---|
| 111 | |
---|
| 112 | |
---|
| 113 | |
---|
| 114 | //! Structure defining an event, composed of the timeLine, the context hashId and the hierachical level of the communication. |
---|
| 115 | struct SEvent |
---|
| 116 | { |
---|
| 117 | size_t timeLine ; /*!< Time line id of the event in the context */ |
---|
| 118 | size_t hashId ; /*!< hassh id of the context */ |
---|
| 119 | size_t level ; /*!<hierarchical level of the communication*/ |
---|
| 120 | |
---|
| 121 | //! Definition of the == operator : needed to order the object in a map container |
---|
| 122 | /*! |
---|
| 123 | @param[in] e : object to compare with |
---|
| 124 | @return : boolean result of the comparison |
---|
| 125 | */ |
---|
| 126 | bool operator==(const SEvent& e) const |
---|
| 127 | { |
---|
| 128 | if (timeLine == e.timeLine && hashId == e.hashId && level==e.level) return true ; |
---|
| 129 | else return false ; |
---|
| 130 | } ; |
---|
| 131 | |
---|
| 132 | |
---|
| 133 | //! Definition of the < operator : needed to order the object in a map container |
---|
| 134 | /*! |
---|
| 135 | @param[in] e : object to compare with |
---|
| 136 | @return : boolean result of the comparison |
---|
| 137 | */ |
---|
| 138 | |
---|
| 139 | bool operator<(const SEvent& e) const |
---|
| 140 | { |
---|
| 141 | if (timeLine < e.timeLine) return true ; |
---|
| 142 | else if (timeLine == e.timeLine && hashId < e.hashId) return true ; |
---|
| 143 | else if (timeLine == e.timeLine && hashId == e.hashId && level<e.level) return true ; |
---|
| 144 | else return false ; |
---|
| 145 | } ; |
---|
| 146 | } ; |
---|
| 147 | |
---|
| 148 | |
---|
| 149 | //! Pending request struture. It keep send or receive buffer from asynchronous communication while the request is not complete. |
---|
| 150 | struct SPendingRequest |
---|
| 151 | { |
---|
| 152 | size_t buffer[3] ; /*!< communication buffer : timeLine, hashId, level */ |
---|
[1639] | 153 | MPI_Request request ; /*!< pending MPI request */ |
---|
[492] | 154 | } ; |
---|
| 155 | |
---|
[1639] | 156 | MPI_Comm communicator ; /*!< Internal MPI communicator */ |
---|
[492] | 157 | int mpiRank ; /*!< Rank in the communicator */ |
---|
| 158 | int mpiSize ; /*!< Size of the communicator */ |
---|
| 159 | |
---|
| 160 | queue< pair<size_t, size_t> > eventStack ; |
---|
| 161 | queue<SPendingRequest* > pendingSentParentRequest ; /*!< Pending request sent to parent */ |
---|
| 162 | queue<SPendingRequest*> pendingRecvParentRequest ; /*!< Pending request recv from parent */ |
---|
| 163 | list<SPendingRequest* > pendingRecvChildRequest ; /*!< Pending request recv from child */ |
---|
| 164 | list<SPendingRequest*> pendingSentChildRequest ; /*!< Pending request sent to child */ |
---|
| 165 | map< SEvent, int > recvEvent ; /*!< list of event received from children. Contains the currnet number children that have already post the same event */ |
---|
| 166 | |
---|
| 167 | |
---|
| 168 | int level ; /*!< Number of hierachical level for communication */ |
---|
| 169 | vector<int> parent ; /*!< Parent rank for each level */ |
---|
| 170 | vector<vector<int> > child ; /*!< List of child rank for each level */ |
---|
| 171 | vector<int> nbChild ; /*!< Number of child for each level */ |
---|
| 172 | |
---|
| 173 | } ; |
---|
| 174 | } |
---|
| 175 | |
---|
| 176 | #endif |
---|