import random import sched import socket import time from threading import Thread from argparse import ArgumentParser from enum import Enum from xmlrpc.client import ServerProxy from xmlrpc.server import SimpleXMLRPCServer PORT = 1234 CLUSTER = [1, 2, 3] ELECTION_TIMEOUT = (6, 8) HEARTBEAT_INTERVAL = 5 class NodeState(Enum): """Enumerates the three possible node states (follower, candidate, or leader)""" FOLLOWER = 1 CANDIDATE = 2 LEADER = 3 class Node: def __init__(self, node_id): """Non-blocking procedure to initialize all node parameters and start the first election timer""" self.node_id = node_id self.state = NodeState.FOLLOWER self.term = 0 self.votes = {} self.log = [] self.pending_entry = '' self.sched = sched.scheduler() self.event = '' # TODO: start election timer for this node self.reset_election_timer() print(f"Node started! State: {self.state}. Term: {self.term}") def is_leader(self): """Returns True if this node is the elected cluster leader and False otherwise""" if self.state == NodeState.LEADER: return True return False def reset_election_timer(self): """Resets election timer for this (follower or candidate) node and returns it to the follower state""" self.state = NodeState.FOLLOWER q = self.sched.queue for event in q: self.sched.cancel(event) #if (self.node_id == 1 or self.node_id == 3): # self.sched.enter(0, 1, self.hold_election, ()) # return self.sched.enter(random.uniform(ELECTION_TIMEOUT[0], ELECTION_TIMEOUT[1]), 1, self.hold_election, ()) def reset_heartbeat_timer(self): q = self.sched.queue for event in q: self.sched.cancel(event) self.sched.enter(HEARTBEAT_INTERVAL, 1, self.append_entries, ()) def hold_election(self): """Called when this follower node is done waiting for a message from a leader (election timeout) The node increments term number, becomes a candidate and votes for itself. Then call request_vote over RPC for all other online nodes and collects their votes. If the node gets the majority of votes, it becomes a leader and starts the hearbeat timer If the node loses the election, it returns to the follower state and resets election timer. """ self.term = self.term + 1 self.state = NodeState.CANDIDATE self.votes = {} self.votes[self.node_id] = True print(f'New election term {self.term}. State: {self.state}') for n0 in CLUSTER: if node == self.node_id: continue try: print(f'Requesting vote from node {n0}') with ServerProxy(f'http://node_{n0}:{PORT}') as proxy: if proxy.request_vote(self.term, self.node_id): self.votes[n0] = True else: self.votes[n0] = False except Exception as e: print(f"couldn't request_vote from {n0}") print(traceback.format_exc()) print(e) if sum(self.votes.values()) > len(CLUSTER) / 2: self.state = NodeState.LEADER self.reset_heartbeat_timer() print(f"New election term {self.term}. State: {self.state}") def request_vote(self, term, candidate_id): """Called remotely when a node requests voting from other nodes. Updates the term number if the received one is greater than `self.term` A node rejects the vote request if it's a leader or it already voted in this term. Returns True and update `self.votes` if the vote is granted to the requester candidate and False otherwise. """ print(f"Got a vote request from {candidate_id} (term={term})") self.reset_election_timer() if term > self.term: self.term = term self.votes = {} if self.is_leader() or len(self.votes) > 0: return False self.votes[candidate_id] = True return True def append_entries(self): """Called by leader every HEARTBEAT_INTERVAL, sends a heartbeat message over RPC to all online followers. Accumulates ACKs from followers for a pending log entry (if any) If the majority of followers ACKed the entry, the entry is committed to the log and is no longer pending """ print("Sending a heartbeat to followers") acks = 0 for n0 in CLUSTER: if n0 == self.node_id: continue try: with ServerProxy(f'http://node_{n0}:{PORT}') as proxy: if proxy.heartbeat(self.pending_entry): acks = acks + 1 except Exception as e: print(f"couldn't heartbeat {n0}") print(traceback.format_exc()) print(e) if self.pending_entry != '' and acks > len(CLUSTER) / 2: self.log.append(self.pending_entry) print(f'Leader commited \'{self.pending_entry}\'') self.pending_entry = '' self.reset_heartbeat_timer() def heartbeat(self, leader_entry): """Called remotely from the leader to inform followers that it's alive and supply any pending log entry Followers would commit an entry if it was pending before, but is no longer now. Returns True to ACK the heartbeat and False on any problems. """ print(f"Heartbeat received from leader (entry='{leader_entry}')") try: self.reset_election_timer() if self.pending_entry != '' and leader_entry != self.pending_entry: self.log.append(self.pending_entry) print(f'Follower commited \'{self.pending_entry}\'') self.pending_entry = leader_entry return True except Exception as e: return False def leader_receive_log(self, log): """Called remotely from the client. Executed only by the leader upon receiving a new log entry Returns True after the entry is committed to the leader log and False on any problems """ print(f"Leader received log \'{log}\' from client") while self.pending_entry != '': time.sleep(1) self.pending_entry = log time.sleep(7) if self.pending_entry == '' and self.log[-1] == log: return True return False if __name__ == '__main__': # TODO: Parse one integer argument (node_id), then create the node with that ID. # TODO: Start RPC server on 0.0.0.0:PORT and expose the node instance # TODO: Run the node scheduler in an isolated thread. # TODO: Handle KeyboardInterrupt and terminate gracefully. try: parser = ArgumentParser() parser.add_argument('node_id') args = parser.parse_args() node = Node(int(args.node_id)) t = Thread(target=node.sched.run) t.start() server = SimpleXMLRPCServer(('0.0.0.0', PORT), logRequests=False) print(f"Listening on port {PORT}...") server.register_function(node.leader_receive_log, "leader_receive_log") server.register_function(node.heartbeat, "heartbeat") server.register_function(node.request_vote, "request_vote") server.register_function(node.is_leader, "is_leader") server.serve_forever() except KeyboardInterrupt: print("node killed...") exit()