1 | #!/usr/bin/env python |
---|
2 | # -*- coding: ISO-8859-1 -*- |
---|
3 | |
---|
4 | ################################## |
---|
5 | # @program smon |
---|
6 | # @description simulation monitor |
---|
7 | # @copyright Copyright "(c)2009 Centre National de la Recherche Scientifique CNRS. |
---|
8 | # All Rights Reserved" |
---|
9 | # @svn_file $Id: analyzer 2545 2013-02-01 09:58:10Z jripsl $ |
---|
10 | # @version $Rev: 2545 $ |
---|
11 | # @lastrevision $Date: 2013-02-01 10:58:10 +0100 (Fri, 01 Feb 2013) $ |
---|
12 | # @license CeCILL (http://dods.ipsl.jussieu.fr/jripsl/smon/LICENSE) |
---|
13 | ################################## |
---|
14 | |
---|
15 | import sys |
---|
16 | import signal |
---|
17 | import traceback |
---|
18 | import smtplib |
---|
19 | from email.mime.text import MIMEText |
---|
20 | import time |
---|
21 | import datetime |
---|
22 | |
---|
23 | |
---|
24 | # line below is to include "smon" package in the search path |
---|
25 | sys.path.append("/home/jripsl/snapshot/Monitoring") |
---|
26 | |
---|
27 | import smon.repo_io as repo_io |
---|
28 | import smon.types as types |
---|
29 | |
---|
30 | |
---|
31 | |
---|
class CheckList():
    """Checks run periodically by the analyzer against the simulation repository."""

    max_time_between_msg = 20  # unit => seconds

    # Layouts of str(<datetime>): the fractional part is only present when
    # the datetime carries a non-zero microsecond value.
    _STAMP_WITH_FRACTION = "%Y-%m-%d %H:%M:%S.%f"
    _STAMP_WITHOUT_FRACTION = "%Y-%m-%d %H:%M:%S"

    @classmethod
    def msg_timeout(cls, message):
        """Tell whether `message` is older than `max_time_between_msg` seconds.

        Parameters:
            message: object exposing a `crea_date` attribute whose str() form is
                "YYYY-MM-DD HH:MM:SS" optionally followed by ".ffffff".

        Returns:
            True if the age of the message is strictly greater than
            `cls.max_time_between_msg`, False otherwise.

        Raises:
            ValueError: if str(message.crea_date) matches neither layout.
        """
        stamp = str(message.crea_date)

        # str() of a datetime omits ".ffffff" when microsecond == 0; always
        # requiring "%f" would raise ValueError on such timestamps.
        if "." in stamp:
            layout = cls._STAMP_WITH_FRACTION
        else:
            layout = cls._STAMP_WITHOUT_FRACTION

        # time.mktime() interprets the parsed value as local time.
        # NOTE(review): assumes crea_date is stored in local time - confirm.
        msg_epoch = time.mktime(time.strptime(stamp, layout))

        return (time.time() - msg_epoch) > cls.max_time_between_msg

    @classmethod
    def C0001(cls):
        """
        description
          check heartbeat (currently, heartbeat is implemented using simulation progress messages).
          if progress messages suddenly stop, it is likely that the simulation was killed or
          a segfault occurred. In such a case, we inform the other components (failover, prodiguer GUI..) by
          changing the simulation status to "error"
        """
        for simulation in repo_io.get_running_simulations():

            print("\nchecking heartbeat ('%s')" % simulation.name)

            try:
                message = repo_io.retrieve_last_message(simulation)
            except types.MessageNotFoundException:
                # we get here in the interval where a new simulation has just been
                # inserted but its first message has not been inserted yet
                print("no message found for simulation ('%s')" % simulation.name)
                continue

            if cls.msg_timeout(message):
                # last message is too old: flag the simulation and persist the new status
                simulation.status = "error"
                repo_io.update_simulation_status(simulation)
                print("heartbeat NOK - simulation status set to 'error'\n")
            else:
                print("heartbeat OK\n")
---|
99 | |
---|
class Analyzer():
    """Life cycle of the analyzer: open the repository, poll the checks, close the repository."""

    @classmethod
    def start(cls):
        """Open the DB connection, then enter the polling loop."""
        repo_io.init()
        Analyzer.main()

    @classmethod
    def stop(cls):
        """Close the DB connection."""
        repo_io.free()

    @classmethod
    def main(cls):
        """Print a banner, then run CheckList.C0001() in an endless loop with a 3 second pause between passes."""

        # Disabled start-up code, kept for reference:
        #
        #   # parse args
        #   parser = argparse.ArgumentParser(prog='analyzer')
        #   parser.add_argument('-v', dest='verbose',required=False,action='store_true')
        #   args = parser.parse_args()
        #
        #   # check
        #   if not os.path.exists(SMON.smon_home):
        #       sys.exit(1)
        #
        #   SMON.init_singleton()

        print(' [*] Analyzer running. To exit press CTRL+C')

        pause_seconds = 3
        while True:
            # check simulations heartbeats
            CheckList.C0001()
            time.sleep(pause_seconds)

        # Disabled shutdown code (the loop above has no break, so this point is not reached):
        #
        #   SMON.free_singleton()
---|
142 | |
---|
def signal_handler(signal, frame):
    """Signal handler (installed for SIGINT): report the interrupt, stop the analyzer, exit with status 0.

    Both parameters are supplied by the signal machinery and are not used here.
    """
    print('You pressed Ctrl+C!')
    Analyzer.stop()
    sys.exit(0)
---|
149 | |
---|
if __name__ == '__main__':

    # have Ctrl+C handled by signal_handler
    signal.signal(signal.SIGINT, signal_handler)

    try:
        Analyzer.start()
    except Exception:
        # any failure: dump the stack trace and exit with a non-zero status
        traceback.print_exc()
        sys.exit(1)
    else:
        # Analyzer.start() returned normally
        sys.exit(0)
---|