You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

206 lines
7.5 KiB

import random
import sched
import socket
import time
import traceback
from argparse import ArgumentParser
from enum import Enum
from threading import Thread
from xmlrpc.client import ServerProxy
from xmlrpc.server import SimpleXMLRPCServer
  10. PORT = 1234
  11. CLUSTER = [1, 2, 3]
  12. ELECTION_TIMEOUT = (6, 8)
  13. HEARTBEAT_INTERVAL = 5
  14. class NodeState(Enum):
  15. """Enumerates the three possible node states (follower, candidate, or leader)"""
  16. FOLLOWER = 1
  17. CANDIDATE = 2
  18. LEADER = 3
  19. class Node:
  20. def __init__(self, node_id):
  21. """Non-blocking procedure to initialize all node parameters and start the first election timer"""
  22. self.node_id = node_id
  23. self.state = NodeState.FOLLOWER
  24. self.term = 0
  25. self.votes = {}
  26. self.log = []
  27. self.pending_entry = ''
  28. self.sched = sched.scheduler()
  29. self.event = ''
  30. # TODO: start election timer for this node
  31. self.reset_election_timer()
  32. print(f"Node started! State: {self.state}. Term: {self.term}")
  33. def is_leader(self):
  34. """Returns True if this node is the elected cluster leader and False otherwise"""
  35. if self.state == NodeState.LEADER:
  36. return True
  37. return False
  38. def reset_election_timer(self):
  39. """Resets election timer for this (follower or candidate) node and returns it to the follower state"""
  40. self.state = NodeState.FOLLOWER
  41. q = self.sched.queue
  42. for event in q:
  43. self.sched.cancel(event)
  44. #if (self.node_id == 1 or self.node_id == 3):
  45. # self.sched.enter(0, 1, self.hold_election, ())
  46. # return
  47. self.sched.enter(random.uniform(ELECTION_TIMEOUT[0], ELECTION_TIMEOUT[1]), 1, self.hold_election, ())
  48. def reset_heartbeat_timer(self):
  49. q = self.sched.queue
  50. for event in q:
  51. self.sched.cancel(event)
  52. self.sched.enter(HEARTBEAT_INTERVAL, 1, self.append_entries, ())
  53. def hold_election(self):
  54. """Called when this follower node is done waiting for a message from a leader (election timeout)
  55. The node increments term number, becomes a candidate and votes for itself.
  56. Then call request_vote over RPC for all other online nodes and collects their votes.
  57. If the node gets the majority of votes, it becomes a leader and starts the hearbeat timer
  58. If the node loses the election, it returns to the follower state and resets election timer.
  59. """
  60. self.term = self.term + 1
  61. self.state = NodeState.CANDIDATE
  62. self.votes = {}
  63. self.votes[self.node_id] = True
  64. print(f'New election term {self.term}. State: {self.state}')
  65. for n0 in CLUSTER:
  66. if node == self.node_id:
  67. continue
  68. try:
  69. print(f'Requesting vote from node {n0}')
  70. with ServerProxy(f'http://node_{n0}:{PORT}') as proxy:
  71. if proxy.request_vote(self.term, self.node_id):
  72. self.votes[n0] = True
  73. else:
  74. self.votes[n0] = False
  75. except Exception as e:
  76. print(f"couldn't request_vote from {n0}")
  77. print(traceback.format_exc())
  78. print(e)
  79. if sum(self.votes.values()) > len(CLUSTER) / 2:
  80. self.state = NodeState.LEADER
  81. self.reset_heartbeat_timer()
  82. print(f"New election term {self.term}. State: {self.state}")
  83. def request_vote(self, term, candidate_id):
  84. """Called remotely when a node requests voting from other nodes.
  85. Updates the term number if the received one is greater than `self.term`
  86. A node rejects the vote request if it's a leader or it already voted in this term.
  87. Returns True and update `self.votes` if the vote is granted to the requester candidate and False otherwise.
  88. """
  89. print(f"Got a vote request from {candidate_id} (term={term})")
  90. self.reset_election_timer()
  91. if term > self.term:
  92. self.term = term
  93. self.votes = {}
  94. if self.is_leader() or len(self.votes) > 0:
  95. return False
  96. self.votes[candidate_id] = True
  97. return True
  98. def append_entries(self):
  99. """Called by leader every HEARTBEAT_INTERVAL, sends a heartbeat message over RPC to all online followers.
  100. Accumulates ACKs from followers for a pending log entry (if any)
  101. If the majority of followers ACKed the entry, the entry is committed to the log and is no longer pending
  102. """
  103. print("Sending a heartbeat to followers")
  104. acks = 0
  105. for n0 in CLUSTER:
  106. if n0 == self.node_id:
  107. continue
  108. try:
  109. with ServerProxy(f'http://node_{n0}:{PORT}') as proxy:
  110. if proxy.heartbeat(self.pending_entry):
  111. acks = acks + 1
  112. except Exception as e:
  113. print(f"couldn't heartbeat {n0}")
  114. print(traceback.format_exc())
  115. print(e)
  116. if self.pending_entry != '' and acks > len(CLUSTER) / 2:
  117. self.log.append(self.pending_entry)
  118. print(f'Leader commited \'{self.pending_entry}\'')
  119. self.pending_entry = ''
  120. self.reset_heartbeat_timer()
  121. def heartbeat(self, leader_entry):
  122. """Called remotely from the leader to inform followers that it's alive and supply any pending log entry
  123. Followers would commit an entry if it was pending before, but is no longer now.
  124. Returns True to ACK the heartbeat and False on any problems.
  125. """
  126. print(f"Heartbeat received from leader (entry='{leader_entry}')")
  127. try:
  128. self.reset_election_timer()
  129. if self.pending_entry != '' and leader_entry != self.pending_entry:
  130. self.log.append(self.pending_entry)
  131. print(f'Follower commited \'{self.pending_entry}\'')
  132. self.pending_entry = leader_entry
  133. return True
  134. except Exception as e:
  135. return False
  136. def leader_receive_log(self, log):
  137. """Called remotely from the client. Executed only by the leader upon receiving a new log entry
  138. Returns True after the entry is committed to the leader log and False on any problems
  139. """
  140. print(f"Leader received log \'{log}\' from client")
  141. while self.pending_entry != '':
  142. time.sleep(1)
  143. self.pending_entry = log
  144. time.sleep(7)
  145. if self.pending_entry == '' and self.log[-1] == log:
  146. return True
  147. return False
  148. if __name__ == '__main__':
  149. # TODO: Parse one integer argument (node_id), then create the node with that ID.
  150. # TODO: Start RPC server on 0.0.0.0:PORT and expose the node instance
  151. # TODO: Run the node scheduler in an isolated thread.
  152. # TODO: Handle KeyboardInterrupt and terminate gracefully.
  153. try:
  154. parser = ArgumentParser()
  155. parser.add_argument('node_id')
  156. args = parser.parse_args()
  157. node = Node(int(args.node_id))
  158. t = Thread(target=node.sched.run)
  159. t.start()
  160. server = SimpleXMLRPCServer(('0.0.0.0', PORT), logRequests=False)
  161. print(f"Listening on port {PORT}...")
  162. server.register_function(node.leader_receive_log, "leader_receive_log")
  163. server.register_function(node.heartbeat, "heartbeat")
  164. server.register_function(node.request_vote, "request_vote")
  165. server.register_function(node.is_leader, "is_leader")
  166. server.serve_forever()
  167. except KeyboardInterrupt:
  168. print("node killed...")
  169. exit()