/*
*	replicaManager.c - The replication manager (wb)
*
*	The replication manager runs as separate process and manages the 
*	communication between local transactions, remote transactions and
*	the group communication system. The main components of the rmgr
*	are:
*		RmgrSock - a server socket on which the rmgr listens for
*					connection requests from transactions (local
*					and remote)
*		GCconn - a structure encapsulating the socket to the
*				 group communication process
*		RconnList - list containing one Rconn object for each 
*					active transaction (remote and local). Rconn
*					objects are created whenever a connection
*					request is accepted on the RmgrSock socket.
*					Each Rconn object contains a socket to the
*					associated transaction
*		waitForBknd - a pointer to an Rconn object that is to 
*					  be processed. (see RmgrReceive)
*		handleVector - a 3x3 matrix containing references to
*					   handling procedures used to process
*					   pending data on various sockets. The
*					   matrix is accessed by using the 
*					   source of the data as first index, and
*					   the destination as second index. The 
*					   matrix is not fully used, since for
*					   example there exists no communication
*					   from local to remote txn's
*		active_rsocks - bitmask identifying all the sockets
*						that must be listened on
*						(see manpages on select())
*
*	The basic control flow in the replication manager looks
*	as follows:
*
*	The <RmgrMain> routine runs an endless loop with a select
*	statement at its beginning. It thus blocks until at least
*	one of the sockets reports pending data.
*	If a request is pending on the RmgrSock socket, a new
*	Rconn object is generated and added to the list.
*	If a request is pending on any other socket, the
*	<RmgrReceive> function is called. This function
*	is responsible for receiving a whole message.
*	When a message has arrived in its entirety, the appropriate
*	message handling procedure is called.
*	After the data has been handled, the control 
*	returns to the <RmgrMain> procedure which will continue to
*	monitor the sockets.
*
*	The handling procedures implement a state machine
*	which acts upon the current state of the connection
*	and the type of message received.
*
*	The rmgr message format used consists of
*		- a 8-byte header containing the message type (4 bytes)
*		  and the length of the message data, header excluded(4 bytes)
*		- a variable amount of data, the length of which is specified
*		  in the header

*	When sent over the group communication system, this message is
*	preceded by an 8-byte header consisting of a 4-byte hostid and
*	a 4 byte xid. Since xid's are not unique in a distributed
*	environment, the unique hostid must be used to identify all
*	postgres backends associated to a replicated transaction 
*	(i.e. one local txn plus its corresponding remote txns)
*/
#include "postgres.h"
#include "miscadmin.h"

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/times.h>
#include <limits.h>
#include <errno.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>

#ifdef HAVE_SYS_SELECT_H
#include <sys/select.h>
#endif

#include "lib/dllist.h"
#include "libpq/libpq.h"
#include "libpq/bufferedSock.h"
#include "libpq/pqcomm.h"
#include "replication/replication.h"
#include "storage/ipc.h"

/* value used for a socket descriptor that is not (yet) open */
#define INVALID_SOCK (-1)

/* number of hostname characters requested from gethostname() */
#define MAX_HOSTNAMELEN 64


/*
*	points to the Rconn that is waiting for its associated
*	backend to start up (used with remote bknd startup).
*	Set by RmgrMain when it creates a connection object for an
*	unknown hostid/xid pair, and cleared by RmgrAccept once the
*	backend has connected.
*/
static Rconn *waitForBknd = NULL;
/*
*	list of all active connections (one Rconn per element)
*/
static Dllist *RconnList = NULL;
/*
*	server socket info: descriptor, bound address and port of the
*	socket on which backends connect to the rmgr
*/
static int ReplicaMgrSock = INVALID_SOCK;
static struct sockaddr_un ReplicaMgrAddr;
static short ReplicaMgrPort = RMGR_PORTNUM;
/*
*	the machine's host identifier (obtained via gethostid() in RmgrMain)
*/
static uint32 myHostId;
/*
*	connection to group communication process
*/
static DataConn GCconn;
/*
*	handling function type decl: a handler processes one complete
*	message for the given connection
*/
typedef void (*handleReadFunc)(Rconn *conn);
/*
*	routing matrix, indexed as handleVector[source][destination];
*	unused combinations stay NULL
*/
static handleReadFunc handleVector[3][3];
/*
*	active sockets bitmask handed to select(), and the highest
*	descriptor currently contained in it (-1 if the set is empty)
*/
static fd_set	active_rsocks;
static int		maxSock = -1;


/* select() set maintenance and Rconn lifecycle helpers */
static void enableSocket(int sock);
static void disableSocket(int sock);
static void destroyRconn(Rconn *conn);
static Rconn* initRconn(uint32 hid, uint32 xid, bufsockptr bsock);

/* message reception, connection acceptance and bookkeeping */
static bool RmgrReceive(DataConn *src);
static bool RmgrAccept(void);
static void RmgrCleanup(int dummy);
static void RmgrStartBackend(int hostid, int xid);
static Rconn* RmgrFindConn(long hostid, long xid, bool remove);

/*
*	connection handling functions handle_<src>2<dst>
*	(ltxn = local txn, rtxn = remote txn, gc = group communication);
*	these are the entries installed in handleVector by RmgrMain
*/
static void handle_rtxn2gc(Rconn *conn);
static void handle_ltxn2gc(Rconn *conn);
static void handle_gc2ltxn(Rconn *conn);
static void handle_gc2rtxn(Rconn *conn);

/*
*	RmgrCleanup
*
*	cleans up the rmgr global data structures
*
*	param:
*		dummy - param needed since this function is registered
*			with the on_proc_exit function. on_proc_exit
*			requires the function to have this type
*	return:
*		void
*/
static void
RmgrCleanup(int dummy)
{
  Dlelem 	*curr,
    *next;
  
  /*
   *	free elems in the RconnList list
   */
  curr = DLGetHead(RconnList);
  while(curr){
    Rconn *conn = (Rconn *) DLE_VAL(curr);
    
    next = DLGetSucc(curr);
    if(sockIsConnected(conn->dc.bsock))
      {
	sockPutInt((int) MSG_CLOSING, 4, conn->dc.bsock);
	sockFlush(conn->dc.bsock);
      }
    destroyRconn(conn);
    DLRemove(curr);
    curr = next;
  }
  /*
   *	clean up global sockets
   */
  close(ReplicaMgrSock);
  sockUnlink(ReplicaMgrAddr.sun_path);
  sockPutInt((int) MSG_CLOSING, 4, GCconn.bsock);
  sockFlush(GCconn.bsock);
  sockDestroy(GCconn.bsock);
#ifdef RMGR_STAT
  LogFile_close();
#endif
}

/*
 *	HandleProtoError
 *
 *	Invoked by handling procs if a protocol error occurs. Logs the
 *	error, marks the connection CLOSED, notifies the peer with
 *	MSG_PROTO_ERROR and terminates the process via proc_exit(200).
 *
 *	The peer is only notified if the connection actually has a
 *	socket: connection objects of remote txns whose backend has not
 *	connected yet are created with a NULL socket pointer, and the
 *	gc->rtxn handler may report a protocol error for them.
 *
 *	param:
 *		conn - pointer to the connection struct reporting
 *			the error
 *	return:
 *		void (does not return to the caller; ends in proc_exit)
 */
void
HandleProtoError(Rconn * conn)
{
  elog(NOTICE, "[%d] reports protocol error!!!", MyProcPid);
  if (conn != NULL)
    {
      conn->state = CLOSED;
      if (conn->dc.bsock != NULL)
	{
	  sockPutInt((int) MSG_PROTO_ERROR, 4, conn->dc.bsock);
	  sockFlush(conn->dc.bsock);
	  sockClose(conn->dc.bsock);
	}
    }
  proc_exit(200);
}

/*
 *	RmgrMain
 *
 *	Main loop of the replication manager. Listens for requests on 
 *	RmgrSock socket and handles pending data on other sockets.
 *
 *	param:
 *		gcsock - socket to the group communication process
 *	return:
 *		int - dummy return value
 */

int
RmgrMain(int gcsock)
{
  fd_set		tmp_rmask;
  Dlelem	   	*curr,
    *next;
#ifdef RMGR_DEBUG
  elog(NOTICE, "ReplicaMgr pid %d", getpid());
#endif
  myHostId = gethostid();
  on_proc_exit(RmgrCleanup, NULL);
  
  /*
   *	initialization segment
   */
  MemSet(&GCconn, 0, sizeof(GCconn));
  GCconn.recv_state = RECV_IDLE;
  GCconn.datalen = -1;
  GCconn.bsock = sockInit(AF_UNIX_BUFSIZE);
  GCconn.bsock->sock = gcsock;
  
  RconnList = DLNewList();
  
  MemSet(&handleVector, 0, sizeof(handleVector));
  handleVector[REMOTE][GROUPCOMM] = handle_rtxn2gc;
  handleVector[LOCAL][GROUPCOMM] = handle_ltxn2gc;
  handleVector[GROUPCOMM][LOCAL] = handle_gc2ltxn;
  handleVector[GROUPCOMM][REMOTE] = handle_gc2rtxn;
  
  /*
   *	set up server port
   */
  if(sockServerPort(NULL, ReplicaMgrPort, &ReplicaMgrSock, 
		    (struct sockaddr *) &ReplicaMgrAddr) != STATUS_OK)
    {
      elog(DEBUG, "RmgrMain: unable to get server socket");
      proc_exit(200);
    }
  
  FD_ZERO(&active_rsocks);
  enableSocket(ReplicaMgrSock);
  enableSocket(GCconn.bsock->sock);
  
#ifdef RMGR_STAT
  LogFile_init();
#endif
  
  for(;;)
    {	
      /*
       *	listen on all active sockets
       */
#ifdef RMGR_DEBUG
      elog(NOTICE, "Rmgr: listening on active sockets");
#endif
      memmove((char *) &tmp_rmask, (char *) &active_rsocks, 
	      sizeof(fd_set));
      
      if(select(maxSock + 1,  &tmp_rmask, (fd_set *) NULL, (fd_set *) NULL,
		(struct timeval *) NULL) < 0)
	{
	  if (errno == EINTR)
	    continue;
	  elog(ERROR, "RmgrMain: select failed (errno=%d)", errno);
	  return STATUS_ERROR;
	}
      
      /*
       *	Handle messages from the group communication system
       */
      if (GCconn.bsock->sock != INVALID_SOCK &&
	  FD_ISSET(GCconn.bsock->sock, &tmp_rmask))
	{
	  int nread = 0,
	    ntotal = 0;
	  uint32	hostid = -1,
	    xid = -1;
	  bufsockptr bsock = GCconn.bsock;
	  Rconn *conn = NULL;
	  route_addr_t dst;
	  
	  do{
	    if((nread = sockReadData(bsock, BKND_ID_SIZE - ntotal, TRUE)) < 0){
	      elog(NOTICE, 
		   "RMgr: unable to read group comm. msg header");
	      return STATUS_ERROR;
	    }
	    ntotal += nread;
	  }while(ntotal < BKND_ID_SIZE);
	  
	  sockGetInt(&hostid, 4, bsock);
	  sockGetInt(&xid, 4, bsock);
	  sockConsume(bsock);
	  
	  if((conn = RmgrFindConn(hostid, xid, FALSE)) == NULL)
	    {
				/*
				 *	generate connection object (always for remote bknd)
				 */
	      if((conn = initRconn(hostid, xid, NULL)) == NULL)
		{	
		  elog(NOTICE, "RMgr: Rconn memory alloc failed");
		  return STATUS_ERROR;
		}
	      DLAddHead(RconnList, DLNewElem(conn));
	      waitForBknd = conn;
	    }
	  
	  /*
	   *	Try to receive the message. If it can't be received
	   *	entirely, go back to main loop
	   */
	  if(!RmgrReceive(&GCconn))
	    {
	      continue;
	    }
	  
	  /*
	   *	The message has now been received entirely
	   */
	  
	  dst = (hostid == myHostId) ? LOCAL : REMOTE;
	  
#ifdef RMGR_DEBUG
	  elog(NOTICE, "Rmgr: Routing.... src=%d, dst=%d", 
	       (int) GROUPCOMM, (int) dst);
#endif
	  /*
	   *	call the appropriate handling procedure
	   */
	  handleVector[GROUPCOMM][dst](conn);
	  
#ifdef RMGR_DEBUG
	  elog(NOTICE, "Rmgr: done... (%d left in buffer)", 
	       sockNumLeft(GCconn.bsock));
	  elog(NOTICE, "\n");
#endif
	  
	}
      
      
      /*
       *	Go through the list of open backend connections and
       *	invoke routing fcnt if pending output
       */
      curr = DLGetHead(RconnList);
      while(curr){
	
	Rconn *conn = (Rconn *) DLE_VAL(curr);
	int sock;
	route_addr_t src;
	
	next = DLGetSucc(curr);
	/*
	 *	If the socket pointer is null, this is a dummy object...
	 */
	if(conn->dc.bsock != NULL)
	  {
	    sock = conn->dc.bsock->sock;
	    if(FD_ISSET(sock, &tmp_rmask))
	      {
		if(conn->state == CLOSED)
		  {
		    disableSocket(sock);
		    DLRemove(curr);
		    destroyRconn(conn);
		  }
		else
		  {
		    if(!RmgrReceive(&conn->dc))
		      {
			curr = next;
			continue;
		      }
		    
		    src = (conn->hostid == myHostId) ? LOCAL : REMOTE;
		    
#ifdef RMGR_DEBUG
		    elog(NOTICE, "Rmgr: Routing.... src=%d, dst=%d", 
			 (int) src, (int) GROUPCOMM);
#endif
		    /*
		     *	call the appropriate handling procedure
		     */
		    handleVector[src][GROUPCOMM](conn);
#ifdef RMGR_DEBUG
		    elog(NOTICE, "Rmgr: done... (%d left in buffer)", 
			 sockNumLeft(conn->dc.bsock));
		    elog(NOTICE, "\n");
#endif
		  }
	      }
	  }
	else 
	  {
	    if(conn->state == CLOSED)
	      {
		DLRemove(curr);
		destroyRconn(conn);	
	      }
	  }
	curr = next;
      }
 
      /*
       *	Handle connect request from backends
       */
      
      if (ReplicaMgrSock != INVALID_SOCK &&
	  FD_ISSET(ReplicaMgrSock, &tmp_rmask))
	{
	  if(!RmgrAccept())
	    {
	      elog(DEBUG, "Rmgr: accept failed");
	      proc_exit(201);
	    }
	}
      
    }
  
}

/*
 *	RmgrStartBackend
 *
 *	Starts a backend on behalf of the replication manager. The postgres
 *	postmaster is requested to start a new backend. After the
 *	authentication phase, the rmgr sends the hostid and xid of the LOCAL
 *	transaction associated to this remote transaction. Remember that the
 *	local txn is not running on the same machine as the remote txn, and
 *	consequently does not necessarily have the same xid. Hostid and xid
 *	are used to identify the set of backends that belong to one single
 *	replicated transaction. After this has been done, the connection is
 *	closed, since the backend will now contact the rmgr on the well-known
 *	server socket.
 *
 *	Any failure (hostname lookup, connect, authentication) terminates
 *	the process via proc_exit.
 *
 *	param:
 *		hostid - the hostid identifying the local txn's host
 *		xid - the xid identifying the local txn's xid
 *	return:
 *		void
 *
 *	NOTE:
 *		For now, a full replication approach is used, and the database
 *		names are supposed to be repl_<hostname>. This is not a very
 *		flexible scheme and should be modified in the future.
 */

static void
RmgrStartBackend(int hostid, int xid)
{
  char		c = '\0';
  char		hostname[MAX_HOSTNAMELEN + 1];
  int		areq = -1;
  StartupPacket	startpkt;
  bufsockptr	backendsock = sockInit(AF_UNIX_BUFSIZE);

  MemSet(&startpkt, 0, sizeof(startpkt));
  startpkt.protoVersion = PG_PROTOCOL_LATEST;
  if (gethostname(hostname, MAX_HOSTNAMELEN) != 0)
    {
      proc_exit(102);
    }
  /*
   * gethostname need not terminate a truncated name; the last valid
   * index of the buffer is MAX_HOSTNAMELEN.
   */
  hostname[MAX_HOSTNAMELEN] = '\0';

  /*
   *	database name and user name are hardcoded :-(
   *	The database name is built with a bounded write so that a long
   *	hostname is truncated instead of overflowing the packet field.
   */
  snprintf(startpkt.database, sizeof(startpkt.database),
	   "repl_%s", hostname);
  strcpy(startpkt.user, "kemme");
  strcpy(startpkt.options, "-r");
  if (sockClientConnect(NULL, 5432, backendsock) != STATUS_OK)
    {
      proc_exit(201);
    }
  /*
   *	send start packet and receive authentication request
   *	Authentication is assumed to be implicit for internal
   *	backends
   */
  sockPutInt(4 + sizeof(startpkt), 4, backendsock);
  sockPutnchar((char *) &startpkt, sizeof(startpkt), backendsock);
  sockFlush(backendsock);
  sockWait(TRUE, FALSE, backendsock);
  sockReadData(backendsock, -1, FALSE);
  sockGetc(&c, backendsock);
  sockGetInt(&areq, 4, backendsock);
  sockConsume(backendsock);
  /*
   * The startup succeeded only if the reply is an 'R' message AND it
   * carries AUTH_REQ_OK; anything else is a failure.
   */
  if (c != 'R' || areq != AUTH_REQ_OK)
    {
      sockDestroy(backendsock);
      elog(NOTICE, "BackendStartup failed");
      proc_exit(201);
    }

  /*
   *	send hostid and xid to backend
   */
  sockPutInt(hostid, 4, backendsock);
  sockPutInt(xid, 4, backendsock);
  sockFlush(backendsock);
  sockDestroy(backendsock);

  /*
   * Disable the GCconn socket. This must be done since we must wait
   * for the remote backend to be started and to process the message that
   * is still in the GCconn buffer, before receiving the next message.
   */
  disableSocket(GCconn.bsock->sock);
}

/*
 *	RmgrReceive
 *
 *	This function handles a connection which reports pending data.
 *	It reads a whole message (not more) into the buffer.
 *
 *	If a message does not arrive in one chunk, this function returns
 *	control to the caller, which will continue to monitor the
 *	sockets. When the next data chunk arrives, this function will
 *	read it and check whether the message is complete. If not, it
 *	again returns control. Keep in mind that this function may be
 *	invoked several times before completing successfully; all
 *	progress is therefore kept in the DataConn (recv_state, datalen)
 *	and never in local variables.
 *
 *	This approach has been adopted so as not to block upon receipt
 *	of an incomplete message, which may improve performance under
 *	high load.
 *
 *	params:
 *		src - the connection reporting pending data
 *	return:
 *		bool - TRUE if message complete, FALSE otherwise
 *		       (FALSE is also returned after a read error)
 */
static bool
RmgrReceive(DataConn *src)
{
  char		header[RMGR_HDR_SIZE];
  uint32	net_len;
  int		nread = 0;
  bufsockptr	bsock = src->bsock;

  /*
   * message reception
   *
   * The switch is re-entered on subsequent calls until the message
   * has arrived completely; src->datalen always holds the number of
   * bytes still missing in the current phase.
   */
  switch (src->recv_state)
    {
    case RECV_IDLE:
      src->datalen = RMGR_HDR_SIZE;
      src->recv_state = RECV_HDR;
      /* deliberately NO break */

    case RECV_HDR:
      if ((nread = sockReadData(bsock, src->datalen, TRUE)) < 0)
	{
	  elog(NOTICE, "RMgr: read error occurred");
	  return FALSE;
	}
#ifdef RMGR_DEBUG
      elog(NOTICE, "\n");
      elog(NOTICE, "Rmgr: Receiving header...(socket=%d, dlen=%d, nread=%d)",
	   (int) src->bsock->sock, src->datalen, nread);
#endif
      src->datalen -= nread;
      if (src->datalen > 0)
	return FALSE;
      sockPeekHeader(RMGR_HDR_SIZE, bsock, header);
      /*
       * The payload length is the last 4 bytes of the header, in
       * network byte order. Copy it out instead of casting the char
       * buffer, which is not guaranteed to be suitably aligned.
       */
      memcpy(&net_len, header + RMGR_HDR_SIZE - 4, sizeof(net_len));
      src->datalen = (int) ntohl(net_len);
      src->recv_state = RECV_DATA;
      /* deliberately NO break */

    case RECV_DATA:
      if ((nread = sockReadData(bsock, src->datalen, TRUE)) < 0)
	{
	  elog(NOTICE, "RMgr: read error occurred");
	  return FALSE;
	}
#ifdef RMGR_DEBUG
      elog(NOTICE, "Rmgr: Receiving data...(socket=%d, dlen=%d, nread=%d)",
	   (int) src->bsock->sock, src->datalen, nread);
#endif
      src->datalen -= nread;
      if (src->datalen > 0)
	return FALSE;
      src->recv_state = RECV_IDLE;
      break;
    }
  return TRUE;
}

/*
*	RmgrAccept
*
*	Responsible for handling RmgrSock requests, i.e. connection requests by
*	transactions. Data transferred by the backend is a hostid and a xid. For
*	remote backends, these ids are matched against the ids stored in 
*	struct pointed to by <waitForBknd>: for each remote bknd, this connection
*	struct must already exist. In every other case, a new connection struct 
*	is generated and inserted into the Rconn list
*	
*	param:
*		void
*	return:
*		bool - TRUE if accept was sucessful, FALSE otherwise
*/
static bool
RmgrAccept(void)
{
  Rconn 		*conn = NULL;
  rc_msg_t	msg_type = MSG_PROTO_ERROR;
  int 		datalen = -1,
    hostid = -1,
    xid = -1;
  bufsockptr 	newsock = NULL; 
  
  /*
   *	accept the connection and read the opening msg
   */
  newsock = sockInit(AF_UNIX_BUFSIZE);
  if (sockServerAccept(ReplicaMgrSock, newsock) != STATUS_OK)
    {
      sockDestroy(newsock);
      return FALSE;
    }
  else
    {	
      int nread = 0;
      do{
	sockWait(TRUE, FALSE, newsock);
	nread = sockReadData(newsock, RMGR_HDR_SIZE - nread, TRUE);
	if(nread < 0){
	  sockDestroy(newsock);
	  proc_exit(203);
	}
      }while(nread < RMGR_HDR_SIZE);
      sockGetInt((int *) &msg_type, 4, newsock);
      sockGetInt(&datalen, 4, newsock);
      if(msg_type != MSG_OPENING || datalen != 8)
	{
	  sockDestroy(newsock);
	  proc_exit(204);
	}
      nread = 0;
      do{
	sockWait(TRUE, FALSE, newsock);
	nread = sockReadData(newsock, datalen - nread, TRUE);
	if(nread < 0)
	  {
	    sockDestroy(newsock);
	    proc_exit(202);
	  }
      }while(nread < datalen);
      sockGetInt(&hostid, 4, newsock);
      sockGetInt(&xid, 4, newsock);
      sockConsume(newsock);
#ifdef RMGR_DEBUG
      elog(NOTICE,"RMgr: connection accepted (sock=%d, msg=%d, len=%d, xid=%d, hid=%x)",
	   newsock->sock, (int) msg_type, datalen, xid, hostid);
#endif	
      /*
       *	generate connection struct if necessary, else use
       *	<waitForBknd> pointer.
       */
      if(hostid == myHostId)
	{
	  if(!(conn = initRconn(hostid, xid, newsock)))
	    {
	      elog(DEBUG, "Rmgr: Rconn initialization failed");
	      return FALSE;
	    }
	  
	  DLAddHead(RconnList, DLNewElem(conn));
	}
      else
	{
	  if(!waitForBknd)
	    {
	      elog(DEBUG, "Rmgr: remote backend connection structure not found");
	      return FALSE;
	    }
	  conn = waitForBknd;
	  waitForBknd = NULL;
	  conn->dc.bsock = newsock;
	  conn->state = BKND_PRESENT;
	  
	  /*
	   * enable GCconn socket (disabled in RmgrStartBackend)
	   */
	  enableSocket(GCconn.bsock->sock);
	  
	  /*
	   * handle the message that triggered the startup...
	   * this must be done AFTER enabling the GCconn
	   * socket since handle_gc2rtxn might disable it again...
	   */	
	  handle_gc2rtxn(conn);
	}
      
      enableSocket(conn->dc.bsock->sock);
      
      return TRUE;
    }
}

/*
 *	handle_ltxn2gc
 *
 *	Handles msgs coming from a local txn. First, we peek at the header
 *	to see what the message type is. Action is taken according to the
 *	message type and the current state of the connection:
 *
 *		READ_PHASE + MSG_WRITESET -> forward to group comm, SEND_PHASE
 *		SEND_PHASE + MSG_ABORT    -> forward to group comm, ABORTED
 *		LOCK_PHASE + MSG_ABORT    -> forward to group comm, ABORTED
 *		LOCK_PHASE + MSG_LOCKED   -> send MSG_COMMIT to group comm,
 *					     WRITE_PHASE
 *
 *	Any other message type in these states is a protocol error; in
 *	all other states the message is discarded. Forwarded messages
 *	are prefixed with the group communication header (hostid, xid).
 *
 *	param:
 *		conn - the connection object to be handled
 *	return:
 *		void
 *
 *	NOTE: Peeking at a socket buffer leaves the data unprocessed in
 *		  the buffer, so be sure to either process or ignore it
 *		  afterwards. Peeking is used to minimize the overhead with
 *		  socket-to-socket data transfer.
 */
static void
handle_ltxn2gc(Rconn *conn)
{
  rc_msg_t	msg_type;
  char		header[RMGR_HDR_SIZE];
  uint32	net_type;
  uint32	net_len;
  int		datalen = -1;
  bufsockptr	ltxnsock = conn->dc.bsock;

  sockPeekHeader(RMGR_HDR_SIZE, ltxnsock, header);
  /*
   * Decode the two network-order header words by copying them out;
   * the char buffer is not guaranteed to be aligned for uint32 access.
   */
  memcpy(&net_type, header, sizeof(net_type));
  memcpy(&net_len, header + 4, sizeof(net_len));
  msg_type = (rc_msg_t) ntohl(net_type);
  datalen = (int) ntohl(net_len);

#ifdef RMGR_DEBUG
  elog(NOTICE, "ltxn2gc: received msg type %d on socket %d (len=%d), state is %d",
       (int) msg_type, ltxnsock->sock, datalen, (int) conn->state);
#endif

  if (msg_type == MSG_PROTO_ERROR)
    {
      /*
       * NOTE(review): the socket is closed here while its descriptor
       * is still part of the select() set, and the remote-txn handler
       * does not close in the same situation -- confirm that this is
       * intended.
       */
      sockClose(ltxnsock);
      conn->state = CLOSED;
      sockIgnoreData(RMGR_HDR_SIZE, ltxnsock);
      sockConsume(ltxnsock);
      return;
    }

  switch (conn->state)
    {
    case READ_PHASE:
      switch (msg_type)
	{
	case MSG_WRITESET:
#ifdef RMGR_STAT
	  LogFile_record(TRUE, conn->xid, "READ->SEND on MSG_WS");
#endif
	  /*write group comm header*/
	  sockPutInt(myHostId, 4, GCconn.bsock);
	  sockPutInt(conn->xid, 4, GCconn.bsock);
	  /*transfer write set*/
	  sockTransfer(GCconn.bsock, ltxnsock, datalen + RMGR_HDR_SIZE);
	  sockFlush(GCconn.bsock);
	  conn->state = SEND_PHASE;
	  break;
	default:
	  HandleProtoError(conn);
	  break;
	}
      break;
    case SEND_PHASE:
      switch (msg_type)
	{
	case MSG_ABORT:
#ifdef RMGR_STAT
	  LogFile_record(TRUE, conn->xid, "SEND->ABORT on MSG_ABORT");
#endif
	  sockPutInt(myHostId, 4, GCconn.bsock);
	  sockPutInt(conn->xid, 4, GCconn.bsock);
	  sockTransfer(GCconn.bsock, ltxnsock,
		       datalen + RMGR_HDR_SIZE);
	  sockFlush(GCconn.bsock);
	  /*
	   * disabling the socket is necessary since the backend
	   * itself may disconnect at any time, which makes the
	   * corresponding socket on rmgr side invalid. This socket will
	   * then report pending data endlessly, without any real
	   * data being present
	   */
	  disableSocket(ltxnsock->sock);
	  conn->state = ABORTED;
	  break;
	default:
	  HandleProtoError(conn);
	  break;
	}
      break;
    case LOCK_PHASE:
      switch (msg_type)
	{
	case MSG_ABORT:
#ifdef RMGR_STAT
	  LogFile_record(TRUE, conn->xid, "LOCK->ABORTED on MSG_ABORT");
#endif
	  sockPutInt(myHostId, 4, GCconn.bsock);
	  sockPutInt(conn->xid, 4, GCconn.bsock);
	  sockTransfer(GCconn.bsock, ltxnsock, datalen + RMGR_HDR_SIZE);
	  sockFlush(GCconn.bsock);
	  disableSocket(ltxnsock->sock);
	  enableSocket(GCconn.bsock->sock);
	  conn->state = ABORTED;
	  break;
	case MSG_LOCKED:
#ifdef RMGR_STAT
	  LogFile_record(TRUE, conn->xid, "LOCK->WRITE on MSG_LOCKED");
#endif
	  /*
	   * MSG_LOCKED is interpreted as Commit message
	   */
	  sockPutInt(myHostId, 4, GCconn.bsock);
	  sockPutInt(conn->xid, 4, GCconn.bsock);
	  sockPutInt((int) MSG_COMMIT, 4, GCconn.bsock);
	  sockPutInt(0, 4, GCconn.bsock);
	  sockFlush(GCconn.bsock);
	  /*
	   *	don't listen to the backend socket any more,
	   *	since the backend will disconnect anytime
	   */
	  disableSocket(ltxnsock->sock);
	  /*
	   *	since lock phase is over, unblock the group
	   *	communication process
	   */
	  enableSocket(GCconn.bsock->sock);
	  sockIgnoreData(datalen + RMGR_HDR_SIZE, ltxnsock);
	  conn->state = WRITE_PHASE;
	  break;
	default:
	  HandleProtoError(conn);
	  break;
	}
      break;
    default:
      sockIgnoreData(datalen + RMGR_HDR_SIZE, ltxnsock);
      break;
    }
  sockConsume(ltxnsock);
}

/*
 *	handle_gc2ltxn
 *
 *	Handles msgs coming from the group comm process and intended for
 *	a local txn. Action is taken according to the message type and
 *	the current state of the connection:
 *
 *		SEND_PHASE  + MSG_WRITESET -> send MSG_WS_RECEIVED to the
 *					      backend, block the group comm
 *					      socket, LOCK_PHASE
 *		WRITE_PHASE + MSG_COMMIT   -> CLOSED
 *		ABORTED     + MSG_WRITESET/MSG_ABORT
 *					   -> first delivery: no change,
 *					      second delivery: CLOSED
 *
 *	Any other message type in these states is a protocol error; in
 *	all other states the message is discarded. The per-connection
 *	delivery counter is incremented for every message.
 *
 *	param:
 *		conn - the connection object to be handled
 *	return:
 *		void
 */
static void
handle_gc2ltxn(Rconn *conn)
{
  rc_msg_t	msg_type;
  char		header[RMGR_HDR_SIZE];
  uint32	net_type;
  uint32	net_len;
  int		datalen = 0;
  bufsockptr	ltxnsock = conn->dc.bsock;

  sockPeekHeader(RMGR_HDR_SIZE, GCconn.bsock, header);
  /*
   * Decode the two network-order header words by copying them out;
   * the char buffer is not guaranteed to be aligned for uint32 access.
   */
  memcpy(&net_type, header, sizeof(net_type));
  memcpy(&net_len, header + 4, sizeof(net_len));
  msg_type = (rc_msg_t) ntohl(net_type);
  datalen = (int) ntohl(net_len);
#ifdef RMGR_DEBUG
  /* the connection may be a dummy object without a socket */
  elog(NOTICE, "gc2ltxn: received msg type %d for socket %d (len=%d), state is %d",
       (int) msg_type,
       (ltxnsock != NULL) ? ltxnsock->sock : INVALID_SOCK,
       datalen,
       (int) conn->state);
#endif

  conn->numDelivered++;
  switch (conn->state)
    {
    case SEND_PHASE:
      switch (msg_type)
	{
	case MSG_WRITESET:
#ifdef RMGR_STAT
	  LogFile_record(TRUE, conn->xid, "SEND->LOCK on MSG_WS");
#endif
	  sockIgnoreData(datalen + RMGR_HDR_SIZE, GCconn.bsock);
	  sockPutInt((int) MSG_WS_RECEIVED, 4, ltxnsock);
	  sockPutInt(0, 4, ltxnsock);
	  sockFlush(ltxnsock);
	  /*
	   *	block the group communication socket until
	   *	the backend reports the end of its lock phase
	   */
	  disableSocket(GCconn.bsock->sock);
	  conn->state = LOCK_PHASE;
	  break;
	default:
	  HandleProtoError(conn);
	  break;
	}
      break;
    case WRITE_PHASE:
      switch (msg_type)
	{
	case MSG_COMMIT:
#ifdef RMGR_STAT
	  LogFile_record(TRUE, conn->xid, "WRITE->CLOSED on MSG_COMMIT");
#endif
	  sockIgnoreData(datalen + RMGR_HDR_SIZE, GCconn.bsock);
	  conn->state = CLOSED;
	  /*
	   * re-enable the backend socket: the main loop only reaps a
	   * CLOSED connection when its socket is reported by select()
	   */
	  enableSocket(ltxnsock->sock);
	  break;
	default:
	  HandleProtoError(conn);
	  break;
	}
      break;
    case ABORTED:
      switch (msg_type)
	{
	case MSG_WRITESET:
	case MSG_ABORT:
	  switch (conn->numDelivered)
	    {
	    case 1:
	      sockIgnoreData(datalen + RMGR_HDR_SIZE,
			     GCconn.bsock);
	      /*wait for second message in transit, no state change*/
	      break;
	    case 2:
#ifdef RMGR_STAT
	      LogFile_record(conn->hostid == myHostId,
			     conn->xid,
			     "ABORT->CLOSED on any MSG_WS | MSG_ABORT");
#endif
	      sockIgnoreData(datalen + RMGR_HDR_SIZE,
			     GCconn.bsock);
	      enableSocket(ltxnsock->sock);
	      conn->state = CLOSED;
	      break;
	    default:
	      HandleProtoError(conn);
	      break;
	    }
	  break;
	default:
	  HandleProtoError(conn);
	  break;
	}
      break;
    default:
      sockIgnoreData(datalen + RMGR_HDR_SIZE, GCconn.bsock);
      break;
    }
  sockConsume(GCconn.bsock);
}

/*
 *	handle_rtxn2gc
 *
 *	Handles msgs coming from a remote transaction. Action is taken
 *	according to the message type and the current state of the
 *	connection:
 *
 *		LOCK_PHASE + MSG_LOCKED -> unblock the group comm socket;
 *					   WRITE_PHASE if one message has
 *					   been delivered to this txn so
 *					   far, CLOSED if two have
 *
 *	Any other message type (or delivery count) in LOCK_PHASE is a
 *	protocol error; in all other states the message is discarded.
 *	An MSG_PROTO_ERROR from the backend marks the connection CLOSED.
 *
 *	param:
 *		conn - the connection object to be handled
 *	return:
 *		void
 */
static void
handle_rtxn2gc(Rconn *conn)
{
  rc_msg_t	msg_type;
  int		datalen = -1;
  char		header[RMGR_HDR_SIZE];
  uint32	net_type;
  uint32	net_len;
  bufsockptr	rtxnsock = conn->dc.bsock;

  sockPeekHeader(RMGR_HDR_SIZE, rtxnsock, header);
  /*
   * Decode the two network-order header words by copying them out;
   * the char buffer is not guaranteed to be aligned for uint32 access.
   */
  memcpy(&net_type, header, sizeof(net_type));
  memcpy(&net_len, header + 4, sizeof(net_len));
  msg_type = (rc_msg_t) ntohl(net_type);
  datalen = (int) ntohl(net_len);
#ifdef RMGR_DEBUG
  elog(NOTICE, "rtxn2gc: received msg type %d on socket %d (len=%d), state is %d",
       (int) msg_type, rtxnsock->sock, datalen, (int) conn->state);
#endif

  if (msg_type == MSG_PROTO_ERROR)
    {
      conn->state = CLOSED;
      sockIgnoreData(RMGR_HDR_SIZE, rtxnsock);
      sockConsume(rtxnsock);
      return;
    }

  switch (conn->state)
    {
    case LOCK_PHASE:
      switch (msg_type)
	{
	case MSG_LOCKED:
	  switch (conn->numDelivered)
	    {
	    case 1:
#ifdef RMGR_STAT
	      LogFile_record(FALSE, conn->xid,
			     "LOCK->WRITE on MSG_LOCKED");
#endif
	      sockIgnoreData(datalen + RMGR_HDR_SIZE,
			     rtxnsock);
	      enableSocket(GCconn.bsock->sock);
	      conn->state = WRITE_PHASE;
	      break;
	    case 2:
#ifdef RMGR_STAT
	      LogFile_record(FALSE, conn->xid,
			     "LOCK->CLOSED on MSG_LOCKED");
#endif
	      sockIgnoreData(datalen + RMGR_HDR_SIZE,
			     rtxnsock);
	      enableSocket(GCconn.bsock->sock);
	      conn->state = CLOSED;
	      break;
	    default:
	      HandleProtoError(conn);
	      break;
	    }
	  break;
	default:
	  HandleProtoError(conn);
	  break;
	}
      break;
    default:
      sockIgnoreData(datalen + RMGR_HDR_SIZE, rtxnsock);
      break;
    }
  sockConsume(rtxnsock);
}

/*
 *	handle_gc2rtxn
 *
 *	Handles msgs coming from the group comm process and intended for
 *	a remote txn. Action is taken according to the message type and
 *	the current state of the connection:
 *
 *		NO_BKND, first message, not MSG_ABORT
 *					    -> start a backend and return;
 *					       the message stays in the
 *					       group comm buffer
 *		NO_BKND      + MSG_ABORT    -> discard (early abort)
 *		NO_BKND      + MSG_WRITESET -> discard, CLOSED
 *		BKND_PRESENT + MSG_WRITESET -> forward to backend, block the
 *					       group comm socket, LOCK_PHASE
 *		BKND_PRESENT + MSG_COMMIT   -> discard, COMMITTABLE
 *		COMMITTABLE  + MSG_WRITESET -> forward to backend followed
 *					       by MSG_COMMIT, block the group
 *					       comm socket, LOCK_PHASE
 *		WRITE_PHASE  + MSG_COMMIT/MSG_ABORT
 *					    -> forward to backend, CLOSED
 *
 *	Any other message type in these states is a protocol error; in
 *	all other states the message is discarded. A discarded message
 *	is always skipped completely (header plus payload) so that no
 *	payload bytes are left behind in the group comm buffer.
 *
 *	param:
 *		conn - the connection object to be handled
 *	return:
 *		void
 */
static void
handle_gc2rtxn(Rconn *conn)
{
  char		header[RMGR_HDR_SIZE];
  uint32	net_type;
  uint32	net_len;
  rc_msg_t	msg_type;
  int		datalen = 0;
  bufsockptr	rtxnsock = NULL;

  sockPeekHeader(RMGR_HDR_SIZE, GCconn.bsock, header);
  /*
   * Decode the two network-order header words by copying them out;
   * the char buffer is not guaranteed to be aligned for uint32 access.
   */
  memcpy(&net_type, header, sizeof(net_type));
  memcpy(&net_len, header + 4, sizeof(net_len));
  msg_type = (rc_msg_t) ntohl(net_type);
  datalen = (int) ntohl(net_len);
#ifdef RMGR_DEBUG
  elog(NOTICE, "gc2rtxn: received msg type %d  (len=%d), state is %d",
       (int) msg_type, datalen, (int) conn->state);
#endif

  if (conn->state == NO_BKND &&
      conn->numDelivered == 0 &&
      msg_type != MSG_ABORT)
    {
      /*
       * the backend is only started if the FIRST message received
       * is NOT an abort message.
       */
#ifdef RMGR_STAT
      LogFile_record(FALSE, conn->xid, "START_BKND_BEGIN");
#endif
      RmgrStartBackend(conn->hostid, conn->xid);
      return;
    }

  rtxnsock = conn->dc.bsock;
  conn->numDelivered++;
  switch (conn->state)
    {
    case NO_BKND:
      switch (msg_type)
	{
	case MSG_ABORT:
#ifdef RMGR_DEBUG
	  elog(NOTICE, "Received early abort!!");
#endif
	  waitForBknd = NULL;
	  sockIgnoreData(RMGR_HDR_SIZE + datalen, GCconn.bsock);
	  break;
	case MSG_WRITESET:
#ifdef RMGR_STAT
	  LogFile_record(FALSE, conn->xid,
			 "NO_BKND->CLOSED on MSG_WS");
#endif
	  sockIgnoreData(RMGR_HDR_SIZE + datalen,
			 GCconn.bsock);
	  conn->state = CLOSED;
	  break;
	default:
	  HandleProtoError(conn);
	  break;
	}
      break;
    case BKND_PRESENT:
      switch (msg_type)
	{
	case MSG_WRITESET:
	  sockTransfer(rtxnsock, GCconn.bsock,
		       datalen + RMGR_HDR_SIZE);
	  sockFlush(rtxnsock);
	  disableSocket(GCconn.bsock->sock);
	  conn->state = LOCK_PHASE;
#ifdef RMGR_STAT
	  LogFile_record(FALSE, conn->xid, "BKND_START_END");
#endif
	  break;
	case MSG_COMMIT:
#ifdef RMGR_DEBUG
	  elog(NOTICE, "Received early commit!!");
#endif
#ifdef RMGR_STAT
	  LogFile_record(FALSE, conn->xid,
			 "BKND_PRESENT->COMMITTABLE on MSG_COMMIT");
#endif
	  conn->state = COMMITTABLE;
	  sockIgnoreData(RMGR_HDR_SIZE + datalen, GCconn.bsock);
	  break;
	default:
	  HandleProtoError(conn);
	  break;
	}
      break;
    case COMMITTABLE:
      switch (msg_type)
	{
	case MSG_WRITESET:
#ifdef RMGR_STAT
	  LogFile_record(FALSE, conn->xid,
			 "COMMITTABLE->LOCK on MSG_WS");
#endif
	  sockTransfer(rtxnsock, GCconn.bsock,
		       datalen + RMGR_HDR_SIZE);
	  /* the commit was received earlier: pass it on right away */
	  sockPutInt((int) MSG_COMMIT, 4, rtxnsock);
	  sockPutInt(0, 4, rtxnsock);
	  sockFlush(rtxnsock);
	  disableSocket(GCconn.bsock->sock);
	  conn->state = LOCK_PHASE;
	  break;
	default:
	  HandleProtoError(conn);
	  break;
	}
      break;
    case WRITE_PHASE:
      switch (msg_type)
	{
	case MSG_COMMIT:
	case MSG_ABORT:
#ifdef RMGR_STAT
	  LogFile_record(FALSE, conn->xid,
			 "WRITE->CLOSED on any MSG_COMMIT | MSG_ABORT");
#endif
	  sockTransfer(rtxnsock, GCconn.bsock, datalen + RMGR_HDR_SIZE);
	  sockFlush(rtxnsock);
	  conn->state = CLOSED;
	  break;
	default:
	  HandleProtoError(conn);
	  break;
	}
      break;
    default:
      sockIgnoreData(RMGR_HDR_SIZE + datalen, GCconn.bsock);
      break;
    }
  sockConsume(GCconn.bsock);
}

/*
 *	RmgrFindConn
 *
 *	Looks up the connection with the given host id and xid in
 *	RconnList. Optionally the list node holding it is unlinked and
 *	freed; the Rconn itself is never freed here.
 *
 *	param:
 *		hostid - the hostid to look for
 *		xid - the xid to look for
 *		remove - TRUE if the matching list node is to be removed
 *			 from the list
 *	return:
 *		Rconn - pointer to the first matching connection,
 *			NULL if not found
 */

static	Rconn*
RmgrFindConn(long hostid, long xid, bool remove)
{
  Dlelem	*node;

  for (node = DLGetHead(RconnList); node != NULL; node = DLGetSucc(node))
    {
      Rconn *candidate = (Rconn *) DLE_VAL(node);

      if (candidate->hostid != hostid || candidate->xid != xid)
	continue;

      if (remove)
	{
	  DLRemove(node);
	  DLFreeElem(node);
	}
      return candidate;
    }

  return NULL;
}

/*
 *	destroyRconn
 *
 *	frees an Rconn structure, including the associated
 *	buffered socket (if there is one). Connection objects of remote
 *	txns whose backend never connected have no socket; for those
 *	only the structure itself is freed. The list node referring to
 *	the connection is NOT touched.
 *
 *	param:
 *		conn - pointer to the connection struct to free
 *		       (NULL is accepted and ignored)
 *	return:
 *		void
 */
static void
destroyRconn(Rconn *conn)
{
  if (conn == NULL)
    return;

#ifdef RMGR_DEBUG
  elog(NOTICE, "destroying Rconn hid = %x, xid = %d",
       conn->hostid, conn->xid);
#endif
  if (conn->dc.bsock != NULL)
    {
      sockDestroy(conn->dc.bsock);
      conn->dc.bsock = NULL;
    }
  free(conn);
}

/*
*	initRconn
*
*	Allocates and initializes an Rconn struct. The struct is
*	zero-filled on allocation, so any field that is not set
*	explicitly below starts out as 0 instead of holding
*	indeterminate data.
*
*	The receive state starts as RECV_IDLE with a data length of -1
*	and no delivered messages. The initial transaction state is
*	READ_PHASE when hid equals myHostId and NO_BKND otherwise.
*
*	param:
*		hid - host id to store in the connection
*		xid - transaction id to store in the connection
*		bsock - buffered socket to attach; the pointer is stored
*				as is (not copied) and is later released by
*				destroyRconn
*	return:
*		Rconn - the initialized Rconn struct, NULL if the
*				allocation failed
*/
static Rconn*
initRconn(uint32 hid, uint32 xid, bufsockptr bsock)
{
  Rconn *conn = calloc(1, sizeof *conn);

  if (conn == NULL)
    {
      elog(DEBUG,"initRconn: malloc failed");
      return NULL;
    }

  conn->dc.bsock = bsock;
  conn->dc.recv_state = RECV_IDLE;
  conn->dc.datalen = -1;
  conn->hostid = hid;
  conn->xid = xid;
  conn->numDelivered = 0;
  conn->state = (hid == myHostId) ? READ_PHASE : NO_BKND;

  return conn;
}

/*
*	enableSocket
*	
*	Activates a given socket, i.e. this socket will be monitored
*	on the next call of select(). maxSock is raised if the socket
*	is higher than the current maximum.
*
*	Descriptors outside the range [0, FD_SETSIZE) cannot be stored
*	in an fd_set and are ignored.
*	
*	param:
*		sock - the socket to activate
*	return:
*		void
*/
static void
enableSocket(int sock)
{
  /*
   * FD_SET() on a negative descriptor or one >= FD_SETSIZE is
   * undefined behaviour (it accesses memory outside the fd_set),
   * so refuse such descriptors instead of corrupting memory.
   */
  if (sock < 0 || sock >= FD_SETSIZE)
    return;

  FD_SET(sock, &active_rsocks);
  if (maxSock < sock)
    maxSock = sock;
}

/*
*	disableSocket
*	
*	Deactivates a given socket, i.e. this socket will not be monitored
*	on the next call of select(). If the socket was the highest one
*	being tracked, maxSock is lowered to the next highest active
*	socket, or set to -1 if no active socket is left.
*
*	Descriptors outside the range [0, FD_SETSIZE) cannot be part of
*	an fd_set and are ignored.
*	
*	param:
*		sock - the socket to deactivate
*	return:
*		void
*/
static void
disableSocket(int sock)
{
  int i;

  /*
   * FD_CLR() on a negative descriptor or one >= FD_SETSIZE is
   * undefined behaviour, so there is nothing safe to clear.
   */
  if (sock < 0 || sock >= FD_SETSIZE)
    return;

  FD_CLR(sock, &active_rsocks);
  if (sock != maxSock)
    return;

  /* the highest socket went away: search downwards for the new maximum */
  for (i = maxSock - 1; i >= 0; i--)
    {
      if (FD_ISSET(i, &active_rsocks))
	{
	  maxSock = i;
	  return;
	}
    }

  /* no active socket left */
  maxSock = -1;
}


#ifdef RMGR_BLAXYZ //these stat fctns are disbled for now!
/* statistic gathering functions*/

/*
*	_statTimeNow
*
*	Stamps a connection with the current times() value and the
*	statistics slot index given by the caller.
*
*	param:
*		conn - the connection to stamp
*		statSlotIndex - value stored in conn->statSlot_ind
*	return:
*		void
*/
static void
_statTimeNow(Rconn *conn, int statSlotIndex)
{
  struct tms unused;	/* times() wants a buffer; its contents are ignored */

  conn->statSlot_ind = statSlotIndex;
  conn->lastStop = times(&unused);
}

/*
*	_statGather
*
*	Folds the time elapsed since startTime (measured with times())
*	into the min/max/sum statistics of one slot.
*
*	A maximum of 0 is treated as "no sample recorded yet": in that
*	case both the minimum and the maximum of the slot are seeded
*	with the new interval. Negative intervals are discarded.
*
*	param:
*		index - index of the statistics slot in tmin/tmax/tsum
*		startTime - times() value taken at the start of the interval
*	return:
*		void
*/
static void
_statGather(int index, clock_t startTime)
{
  struct tms unused;
  clock_t elapsed = times(&unused) - startTime;

  if (elapsed < 0)
    return;

  if (tmax[index] == 0)
    {
      /* first sample for this slot seeds both extremes */
      tmax[index] = elapsed;
      tmin[index] = elapsed;
    }
  else
    {
      if (elapsed > tmax[index])
	tmax[index] = elapsed;

      if (elapsed < tmin[index])
	tmin[index] = elapsed;
    }

  /* the running total grows with every accepted sample */
  tsum[index] += elapsed;
}

/*
*	_statShow
*
*	Prints the gathered replication manager statistics to stdout:
*	the transaction counters first, then - only if at least one
*	local transaction was counted - the write set sizes and the
*	local phase timings, and - only if at least one remote
*	transaction was counted - the remote phase timings.
*	Timings are converted from clock ticks to seconds by dividing
*	by CLK_TCK; averages are additionally divided by the respective
*	transaction count.
*
*	param:
*		void
*	return:
*		void
*/
static void
_statShow(void)
{
  printf("-----------------------------------------\n");
  printf("Replication manager statistics (node %x):\n", myHostId);
  printf("-----------------------------------------\n\n");

  printf("General stats:\n");
  printf("\tTotal Remote Txns: %d\n", NumRTxns);
  printf("\tTotal Local Txns: %d\n", NumLTxns);

  fflush(stdout);

  if(NumLTxns > 0)
    {
      /* divisors shared by all local timings */
      float ticks = (float)CLK_TCK;
      float allLocalTicks = (float)(NumLTxns * CLK_TCK);
      float wsAvg = (float)WSsum/(float) NumLTxns;

      printf("Write set size sent:\n\t\tmin = %d\n\t\tmax = %d\n\t\tavg = %1.3e",
	     WSmin, WSmax, wsAvg);

      printf("\nLocal Backends:\n");
      printf("\tSend Phase:\n\t\t min = %1.3e sec.\n\t\tmax = %1.3e sec.\n\t\tavg = %1.3e sec.\n",
	     (float)tmin[SendPhase_ind]/ticks,
	     (float)tmax[SendPhase_ind]/ticks,
	     (float)tsum[SendPhase_ind]/allLocalTicks);
      printf("\tLock Phase:\n\t\tmin = %1.3e sec.\n\t\tmax = %1.3e  sec.\n\t\tavg = %1.3e  sec.\n",
	     (float)tmin[LockPhaseLocal_ind]/ticks,
	     (float)tmax[LockPhaseLocal_ind]/ticks,
	     (float)tsum[LockPhaseLocal_ind]/allLocalTicks);
      printf("\tWrite Phase:\n\t\tmin = %1.3e  sec.\n\t\tmax = %1.3e  sec.\n\t\tavg = %1.3e  sec.\n",
	     (float)tmin[WritePhaseLocal_ind]/ticks,
	     (float)tmax[WritePhaseLocal_ind]/ticks,
	     (float)tsum[WritePhaseLocal_ind]/allLocalTicks);
      fflush(stdout);
    }

  if(NumRTxns > 0)
    {
      /* divisors shared by all remote timings */
      float ticks = (float)CLK_TCK;
      float allRemoteTicks = (float)(NumRTxns * CLK_TCK);

      printf("\nRemote Backends:\n");
      printf("\tBackend startup:\n\t\tmin= %1.3e  sec.\n\t\tmax= %1.3e  sec.\n\t\tavg= %1.3e  sec.\n",
	     (float)tmin[StartupRemote_ind]/ticks,
	     (float)tmax[StartupRemote_ind]/ticks,
	     (float)tsum[StartupRemote_ind]/allRemoteTicks);
      printf("\tLock Phase:\n\t\tmin = %1.3e  sec.\n\t\tmax = %1.3e  sec.\n\t\tavg = %1.3e  sec.\n",
	     (float)tmin[LockPhaseRemote_ind]/ticks,
	     (float)tmax[LockPhaseRemote_ind]/ticks,
	     (float)tsum[LockPhaseRemote_ind]/allRemoteTicks);
      printf("\tWrite Phase:\n\t\tmin = %1.3e  sec.\n\t\tmax = %1.3e  sec.\n\t\tavg = %1.3e  sec.\n",
	     (float)tmin[WritePhaseRemote_ind]/ticks,
	     (float)tmax[WritePhaseRemote_ind]/ticks,
	     (float)tsum[WritePhaseRemote_ind]/allRemoteTicks);
      fflush(stdout);
    }
}
#endif

