1 | #!/usr/bin/env python |
---|
2 | # -*- coding: ISO-8859-1 -*- |
---|
3 | |
---|
4 | ################################## |
---|
5 | # @program smon |
---|
6 | # @description simulation monitor |
---|
7 | # @copyright Copyright "(c)2009 Centre National de la Recherche Scientifique CNRS. |
---|
8 | # All Rights Reserved" |
---|
9 | # @svn_file $Id: analyzer 2545 2013-02-01 09:58:10Z jripsl $ |
---|
10 | # @version $Rev: 2545 $ |
---|
11 | # @lastrevision $Date: 2013-02-01 10:58:10 +0100 (Fri, 01 Feb 2013) $ |
---|
12 | # @license CeCILL (http://dods.ipsl.jussieu.fr/jripsl/smon/LICENSE) |
---|
13 | ################################## |
---|
14 | |
---|
15 | import sys |
---|
16 | import signal |
---|
17 | import traceback |
---|
18 | import smtplib |
---|
19 | from email.mime.text import MIMEText |
---|
20 | import time; |
---|
21 | from datetime import datetime |
---|
22 | |
---|
23 | |
---|
24 | # line below is to include "smon" package in the search path |
---|
25 | sys.path.append("/home/jripsl/snapshot/Monitoring") |
---|
26 | |
---|
27 | import smon.repo_io as repo_io |
---|
28 | import smon.types as types |
---|
29 | |
---|
30 | |
---|
31 | |
---|
class CheckList():
    """Namespace for the checks run by the analyzer.

    All members are classmethods; the class is used without being
    instantiated.
    """

    # Maximum delay tolerated between "now" and the timestamp of the last
    # message of a simulation. unit => seconds
    max_time_between_msg = 10

    @classmethod
    def datetime_to_epoch(cls, datetime):
        """Convert a timestamp string into seconds since the epoch.

        Arguments:
            datetime -- string formatted as "29.08.2011 11:05:02"
                        (day.month.year hour:minute:second). Note that this
                        parameter name shadows the module-level 'datetime'
                        import inside this method; it is kept unchanged for
                        caller compatibility.

        Returns:
            The epoch as a float, as computed by time.mktime() (which
            interprets the parsed fields as local time).

        Raises:
            ValueError if the string does not match the expected format.
        """
        parsed = time.strptime(datetime, "%d.%m.%Y %H:%M:%S")

        # time.mktime() already returns the epoch as a float: no further
        # conversion (and no ".time()" call) is needed on its result.
        return time.mktime(parsed)

    @classmethod
    def msg_timeout(cls, message):
        """Tell whether a message is older than the allowed delay.

        Arguments:
            message -- object exposing a 'timestamp' attribute in the format
                       accepted by datetime_to_epoch().

        Returns:
            True if more than 'max_time_between_msg' seconds separate the
            message timestamp from the current time, False otherwise.
        """
        msg_time = cls.datetime_to_epoch(message.timestamp)
        current_time = time.time()

        diff = current_time - msg_time

        # debug
        print("diff=%s" % diff)

        return diff > cls.max_time_between_msg

    @classmethod
    def C0001(cls):
        """Check the heartbeat of every running simulation.

        Currently, heartbeat is implemented using simulation progress
        messages. If progress messages suddenly stop, it is likely that the
        simulation was killed or that a segfault occurred. In such a case, we
        inform the other components (failover, prodiguer GUI..) by changing
        the simulation status to "error".
        """
        for simulation in repo_io.get_running_simulations():

            print("checking heartbeat for '%s'" % simulation.name)

            try:
                message = repo_io.retrieve_last_message(simulation)
            except types.MessageNotFoundException:
                # no last message could be retrieved for this simulation,
                # so there is nothing to compare against: skip it
                continue

            # must go through 'cls': msg_timeout is a classmethod, not a
            # module-level function
            if cls.msg_timeout(message):

                simulation.status = "error"
                repo_io.update_simulation_status(simulation)

                print("heartbeat NOK for '%s' (simulation status set to 'error')" % simulation.name)

            else:
                print("heartbeat OK for '%s'" % simulation.name)
91 | |
---|
class Analyzer():
    """Start/stop entry points and main loop of the analyzer.

    All members are classmethods; the class is used without being
    instantiated.
    """

    @classmethod
    def start(cls):
        """Initialise the repository layer, then enter the main loop."""
        # open DB connection
        repo_io.init()

        Analyzer.main()

    @classmethod
    def stop(cls):
        """Release the repository layer."""
        # close DB connection
        repo_io.free()

    @classmethod
    def main(cls):
        """Run the heartbeat check in an endless loop, pausing 1s between runs."""

        # Disabled start-up code, kept for reference:
        #
        #   # parse args
        #   parser = argparse.ArgumentParser(prog='analyzer')
        #   parser.add_argument('-v', dest='verbose',required=False,action='store_true')
        #   args = parser.parse_args()
        #
        #   # check
        #   if not os.path.exists(SMON.smon_home):
        #       sys.exit(1)
        #
        #   SMON.init_singleton()

        print(' [*] Analyzer running. To exit press CTRL+C')

        while True:
            print("checking simulations heartbeats")

            CheckList.C0001()

            # wait one second before the next round of checks
            time.sleep(1)

        # Disabled tear-down code, kept for reference:
        #
        #   SMON.free_singleton()
134 | |
---|
def signal_handler(signal, frame):
    """Signal callback: announce the interruption, stop the analyzer, exit.

    Arguments:
        signal -- first argument passed by the signal machinery (unused here;
                  note it shadows the 'signal' module inside this function)
        frame  -- second argument passed by the signal machinery (unused here)

    Never returns: the process exits with status 0.
    """
    print('You pressed Ctrl+C!')

    # release resources held by the analyzer before leaving
    Analyzer.stop()

    sys.exit(0)
141 | |
---|
if __name__ == '__main__':

    # route SIGINT (CTRL+C) to signal_handler
    signal.signal(signal.SIGINT, signal_handler)

    try:
        Analyzer.start()
    except Exception:
        # report the failure on stderr and exit with a non-zero status
        traceback.print_exc()

        sys.exit(1)
    else:
        # Analyzer.start() returned without raising
        sys.exit(0)