Merge pull request #1854 from zcorrea/TRAFODION-3318

[TRAFODION-3318] Changed process management rules for DTM process:
diff --git a/core/sqf/monitor/test/monitor.env b/core/sqf/conf/monitor.env
similarity index 100%
rename from core/sqf/monitor/test/monitor.env
rename to core/sqf/conf/monitor.env
diff --git a/core/sqf/sql/scripts/nameserver.env b/core/sqf/conf/nameserver.env
similarity index 100%
rename from core/sqf/sql/scripts/nameserver.env
rename to core/sqf/conf/nameserver.env
diff --git a/core/sqf/sql/scripts/sqconfig.persist b/core/sqf/conf/sqconfig.persist
similarity index 100%
rename from core/sqf/sql/scripts/sqconfig.persist
rename to core/sqf/conf/sqconfig.persist
diff --git a/core/sqf/sql/scripts/sqconfig.sample b/core/sqf/conf/sqconfig.sample
similarity index 100%
rename from core/sqf/sql/scripts/sqconfig.sample
rename to core/sqf/conf/sqconfig.sample
diff --git a/core/sqf/export/include/common/evl_sqlog_eventnum.h b/core/sqf/export/include/common/evl_sqlog_eventnum.h
index 10268d8..c7e8879 100644
--- a/core/sqf/export/include/common/evl_sqlog_eventnum.h
+++ b/core/sqf/export/include/common/evl_sqlog_eventnum.h
@@ -97,6 +97,7 @@
 #define MON_CLUSTER_RESPONSIVE_1            101011701
 #define MON_CLUSTER_RESPONSIVE_2            101011702
 #define MON_CLUSTER_RESPONSIVE_3            101011703
+#define MON_CLUSTER_RESPONSIVE_4            101011704
 
 #define MON_CLUSTER_CONNTONEWMON_1          101011801
 #define MON_CLUSTER_CONNTONEWMON_2          101011802
@@ -233,15 +234,22 @@
 #define MON_CLUSTER_NO_LICENSE_VERIFIERS    101014601
 
 #define MON_CLUSTER_ALLGATHERSOCKRECONN_1   101014701
+#define MON_CLUSTER_ALLGATHERSOCKRECONN_2   101014702
 
 #define MON_CLUSTER_HARDNODEUP_1            101014801
 
 #define MON_CLUSTER_ACCEPTSOCKPEER_1        101014901
 #define MON_CLUSTER_ACCEPTSOCKPEER_2        101014902
+#define MON_CLUSTER_ACCEPTSOCKPEER_3        101014903
+#define MON_CLUSTER_ACCEPTSOCKPEER_4        101014904
+#define MON_CLUSTER_ACCEPTSOCKPEER_5        101014905
 
 #define MON_CLUSTER_CONNECTSOCKPEER_1       101015001
 #define MON_CLUSTER_CONNECTSOCKPEER_2       101015002
 #define MON_CLUSTER_CONNECTSOCKPEER_3       101015003
+#define MON_CLUSTER_CONNECTSOCKPEER_4       101015004
+#define MON_CLUSTER_CONNECTSOCKPEER_5       101015005
+#define MON_CLUSTER_CONNECTSOCKPEER_6       101015006
 
 #define MON_CLUSTER_EPOLLCTLDELETE_1        101015101
 
@@ -249,6 +257,12 @@
 #define MON_PINGSOCKPEER_2                  101015202
 #define MON_PINGSOCKPEER_3                  101015203
 #define MON_PINGSOCKPEER_4                  101015204
+#define MON_PINGSOCKPEER_5                  101015205
+#define MON_PINGSOCKPEER_6                  101015206
+#define MON_PINGSOCKPEER_7                  101015207
+#define MON_PINGSOCKPEER_8                  101015208
+#define MON_PINGSOCKPEER_9                  101015209
+#define MON_PINGSOCKPEER_10                 101015210
 
 #define MON_CLUSTER_ASSIGNMONITORLEADER_1   101015301
 #define MON_CLUSTER_ASSIGNMONITORLEADER_2   101015302
@@ -264,6 +278,10 @@
 
 #define MON_CLUSTER_SENDSOCK_1              101015701
 
+#define MON_CLUSTER_PINGRECONNSOCKPEER_1    101015801
+
+#define MON_CLUSTER_ENSUREANDGETSEQNUM_1    101015901
+
 /* Module: monitor.cxx = 02 */
 
 #define MON_MONITOR_MAIN_1                  101020101
@@ -284,6 +302,7 @@
 #define MON_MONITOR_MAIN_16                 101020116
 #define MON_MONITOR_MAIN_17                 101020117
 #define MON_MONITOR_MAIN_18                 101020118
+#define MON_MONITOR_MAIN_19                 101020119
 #define MON_MONITOR_TMLEADER_1              101020201
 #define MON_MONITOR_TMLEADER_2              101020202
 #define MON_MONITOR_DEATH_HANDLER_1         101020301
@@ -347,7 +366,10 @@
 #define MON_MONITOR_CREATEZCLIENT_1         101022301
 #define MON_MONITOR_CREATEZCLIENT_2         101022302
 #define MON_MONITOR_CREATEZCLIENT_3         101022303
+#define MON_MONITOR_CREATEZCLIENT_4         101022304
+#define MON_MONITOR_CREATEZCLIENT_5         101022305
 #define MON_MONITOR_STARTZCLIENT_1          101022401
+#define MON_MONITOR_SIGTERMSIGNALHANDLER_1  101022501
 
 /* Module: process.cxx = 03 */
 
@@ -437,6 +459,9 @@
 #define MON_PROCESS_PROCEXIT_1              101032801
 #define MON_PROCESS_PROCEXIT_2              101032802
 #define MON_PROCESS_PROCEXITUNREGALL_1      101032901
+#define MON_PROCESS_DUMP_1                  101033001
+#define MON_PROCESS_DUMP_BEGIN_1            101034001
+#define MON_PROCESS_DUMP_BEGIN_2            101034002
 
 /* Module: pnode.cxx = 04 */
 
@@ -461,11 +486,13 @@
 #define MON_NODE_ADDNODE_1                  101041101
 #define MON_NODE_ADDNODE_2                  101041102
 #define MON_NODE_ADDNODE_3                  101041103
+#define MON_NODE_ADDNODE_4                  101041104
 #define MON_NODE_ADDLNODES_1                101041201
 #define MON_NODE_ADDLNODES_2                101041202
 #define MON_NODE_ADDLNODES_3                101041203
 #define MON_NODE_ADDLNODES_4                101041204
 #define MON_NODE_DELETENODE_1               101041301
+#define MON_NODE_DELETENODE_2               101041302
 #define MON_NODE_STARTNAMESERVER_1          101041401
 #define MON_NODE_GETPROCESSNS_1             101041501
 #define MON_NODE_GETPROCESSNS_2             101041502
@@ -484,6 +511,8 @@
 #define MON_NODE_GETPROCESSLBYTYPENS_1      101041801
 #define MON_NODE_GETPROCESSLBYTYPENS_2      101041802
 #define MON_NODE_GETPROCESSLBYTYPENS_3      101041803
+#define MON_NODE_STARTDTMPROCESS_1          101041901
+#define MON_NODE_STARTDTMPROCESS_2          101041902
 
 /* Module: config.cxx = 05 */
 
@@ -500,9 +529,10 @@
 #define MON_CONFIGCONT_ADDPROCNAME_1        101050601
 #define MON_CONFIGCONT_ADDCLUSTERDATA_1     101050701
 #define MON_CONFIGCONT_ADDPROCDATA_1        101050801
-#define MON_CONFIGCONT_FINDUNIQUESTRING_1   101050901
+#define MON_CONFIGCONT_GETUNIQUESTRINGID_1  101050901
 #define MON_CONFIGCONT_GETUNIQUESTRIDMAX_1  101051001
-#define MON_CONFIGCONT_STRINGIDTPSTRING_1   101051101
+#define MON_CONFIGCONT_STRINGIDTOSTRING_1   101051101
+#define MON_CONFIGCONT_GETUNIQUESTRING_1    101051201
 
 /* Module: tmsync.cxx = 06 */
 
@@ -536,6 +566,9 @@
 #define MON_DEVICE_UNMOUNT_2                101070302
 #define MON_DEVICE_CONTAINER_1              101070401
 #define MON_DEVICE_CONTAINER_2              101070402
+#define MON_LDEVICE_MOUNT_1                 101070501
+#define MON_LDEVICE_UNMOUNT_1               101070601
+#define MON_LDEVICE_CREATEDEVICE_1          101070701
 
 /* Module: monsonar.cxx = 08 */
 
@@ -577,6 +610,8 @@
 #define MON_CLUSTERCONF_UPDATEDBPNODE_2     101091302
 #define MON_CLUSTERCONF_DELETEDBUSTRING_1   101091401
 #define MON_CLUSTERCONF_DELETEDBUSTRING_2   101091402
+#define MON_CLUSTERCONF_CLUSTERCONFIG_1     101091501
+#define MON_CLUSTERCONF_CLUSTERCONFIG_2     101091502
 
 /* Module: lock.cxx = 10 */
 
@@ -680,6 +715,8 @@
 #define MON_REDIR_USTDINREMOTE_1            101132001
 #define MON_REDIR_USTDINREM_HNDLIN_1        101132101
 #define MON_REDIR_STDOUT_1                  101132201
+#define MON_REDIR_STDOUT_2                  101132202
+#define MON_REDIR_STDOUT_3                  101132203
 #define MON_REDIR_USTDOUT_1                 101132301
 #define MON_REDIR_STDOUT_HNDLOUT_1          101132401
 #define MON_REDIR_ADDTOEPOLL_1              101132501
@@ -786,6 +823,12 @@
 #define MON_REQ_PROCINFO_1                  101182601
 #define MON_REQ_PROCINFOCONT_1              101182701
 #define MON_INTREQ_CHILDDEATH_1             101182801
+#define MON_INTREQ_DUMP_1                   101182901
+#define MON_INTREQ_DUMP_2                   101182902
+#define MON_INTREQ_DUMPCOMPLETE_1           101183001
+#define MON_INTREQ_DUMPCOMPLETE_2           101183002
+#define MON_INTREQ_EVENT_1                  101183101
+#define MON_INTREQ_EVENT_2                  101183102
 
 /* Module: clio.cxx = 19 */
 #define MON_CLIO_ACQUIRE_MSG_1              101190101
@@ -839,6 +882,7 @@
 #define MON_HEALTHCHECK_Q_BLOCK             101230501
 
 #define MON_HEALTHCHECK_STOP_NS_1           101230701
+#define MON_HEALTHCHECK_TIMETOLOGHEALTH     101230702
 
 /* Module: sdtimer.cxx = 24 */
 #define MON_SDTIMER_SOFTDOG_TH_1            101240101
@@ -854,6 +898,7 @@
 #define MON_SDTIMER_UNMOUNTDEVS_1           101240801
 #define MON_SDTIMER_SUSPENDMONITORPROC_1    101240901
 #define MON_SDTIMER_TIMEREXPIRED_1          101241001
+#define MON_SDTIMER_STARTSOFTDOGTIMER_1     101241101
 
 /* Module: wdtimer.cxx = 25 */
 #define MON_WDTIMER_WATCHDOG_TH_1           101250101
@@ -945,6 +990,7 @@
 #define MON_COMMACCEPT_17                   101320117
 #define MON_COMMACCEPT_18                   101320118
 #define MON_COMMACCEPT_19                   101320119
+#define MON_COMMACCEPT_20                   101320120
 
 /* Module: reqnodedown.cxx = 33 */
 #define MON_EXT_NODEDOWN_REQ                101330101
@@ -956,49 +1002,70 @@
 #define MON_PERSISTCONFIG_ADDCONFIG_1       101360101
 
 /* Module: zclient.cxx = 37 */
-#define MON_ZCLIENT_ZCLIENT_1               101370101
-#define MON_ZCLIENT_ZCLIENT_2               101370102
-#define MON_ZCLIENT_ZCLIENT_3               101370103
-#define MON_ZCLIENT_SYNC_STRING_COMP_1      101370201
-#define MON_ZCLIENT_CHECKCLUSTER_1          101370301
-#define MON_ZCLIENT_CHECKCLUSTER_2          101370302
-#define MON_ZCLIENT_CHECKCLUSTERZNODES_1    101370401
-#define MON_ZCLIENT_CHECKCLUSTERZNODES_2    101370402
-#define MON_ZCLIENT_CHECKCLUSTERZNODES_3    101370403
-#define MON_ZCLIENT_GETCLUSTERZNODES_1      101370501
-#define MON_ZCLIENT_GETCLUSTERZNODES_2      101370502
-#define MON_ZCLIENT_REGISTERZNODE_1         101370601
-#define MON_ZCLIENT_SHUTDOWNWORK_1          101370701
-#define MON_ZCLIENT_ZCLIENTTHREAD_1         101370801
-#define MON_ZCLIENT_STARTWORK_1             101370901
-#define MON_ZCLIENT_MONITORZCLUSTER_1       101371001
-#define MON_ZCLIENT_GETZNODEDATA_1          101371101
-#define MON_ZCLIENT_GETZNODEDATA_2          101371102
-#define MON_ZCLIENT_GETZNODEDATA_3          101371103
-#define MON_ZCLIENT_WATCHCLUSTER_1          101371201
-#define MON_ZCLIENT_WATCHCLUSTER_2          101371202
-#define MON_ZCLIENT_SETZNODEWATCH_1         101371301
-#define MON_ZCLIENT_SETZNODEWATCH_2         101371302
-#define MON_ZCLIENT_WATCHNODE_1             101371401
-#define MON_ZCLIENT_ZSESSIONWATCHER_1       101371501
-#define MON_ZCLIENT_ZSESSIONWATCHER_2       101371502
-#define MON_ZCLIENT_CHECKZNODE_1            101371601
-#define MON_ZCLIENT_WATCHNODEDELETE_1       101371701
-#define MON_ZCLIENT_WATCHNODEDELETE_2       101371702
-#define MON_ZCLIENT_WATCHNODEDELETE_3       101371703
-#define MON_ZCLIENT_ISZNODEEXPIRED_1        101371801
-#define MON_ZCLIENT_ISZNODEEXPIRED_2        101371802
-#define MON_ZCLIENT_CHECKMYZNODE_1          101371901
-#define MON_ZCLIENT_CHECKMYZNODE_2          101371902
-#define MON_ZCLIENT_AMICONFIGUREDMASTER_1   101372101
-#define MON_ZCLIENT_AMICONFIGUREDMASTER_2   101372102
-#define MON_ZCLIENT_WAITFORANDRETURNMASTER  101372103
-#define MON_ZCLIENT_CREATEMASTERZNODE       101372104
-#define MON_ZCLIENT_WATCHMASTERNODEDELETE_1 101372105
-#define MON_ZCLIENT_WATCHMASTERNODEDELETE_2 101372106
-#define MON_ZCLIENT_WATCHMASTERNODEDELETE_3 101372107
-#define MON_ZCLIENT_CREATEORSETMASTERWATCH  101372108
-#define MON_ZCLIENT_CREATEORSETMASTERINFO   101372109
+#define MON_ZCLIENT_ZCLIENTTHREAD_1         101370101
+#define MON_ZCLIENT_ZSESSIONWATCHER_1       101370201
+#define MON_ZCLIENT_ZSESSIONWATCHER_2       101370202
+#define MON_ZCLIENT_ZCLIENT_1               101370301
+#define MON_ZCLIENT_ZCLIENT_2               101370302
+#define MON_ZCLIENT_ZCLIENT_3               101370303
+#define MON_ZCLIENT_CONFIGZNODESDELETE_1    101370401
+#define MON_ZCLIENT_CONFIGZNODESGET_1       101370501
+#define MON_ZCLIENT_CONFIGZNODESGET_2       101370502
+#define MON_ZCLIENT_ERRORZNODESGET_1        101370601
+#define MON_ZCLIENT_ERRORZNODESGET_2        101370602
+#define MON_ZCLIENT_ERRORZNODESDELETE_1     101370701
+#define MON_ZCLIENT_ERRORZNODESDELETE_2     101370702
+#define MON_ZCLIENT_ISZNODEEXPIRED_1        101370801
+#define MON_ZCLIENT_ISZNODEEXPIRED_2        101370802
+#define MON_ZCLIENT_WAITFORRETURNMASTER_1   101370901
+#define MON_ZCLIENT_MASTERZNODECREATE_1     101371001
+#define MON_ZCLIENT_MASTERZNODEDELETE_1     101371101
+#define MON_ZCLIENT_MYRUNNINGZNODECHECK_1   101371201
+#define MON_ZCLIENT_MYRUNNINGZNODECHECK_2   101371202
+#define MON_ZCLIENT_SHUTDOWNWORK_1          101371301
+#define MON_ZCLIENT_STARTWORK_1             101371401
+#define MON_ZCLIENT_RUNZNODEWATCHADD_1      101371501
+#define MON_ZCLIENT_RUNZNODEWATCHDELETE_1   101371601
+#define MON_ZCLIENT_WATCHNODE_1             101371701
+#define MON_ZCLIENT_RUNZNODESCHECK_1        101371801
+#define MON_ZCLIENT_RUNZNODESCHECK_2        101371802
+#define MON_ZCLIENT_RUNZNODESDELETE_1       101371901
+#define MON_ZCLIENT_RUNZNODESDELETE_2       101371902
+#define MON_ZCLIENT_RUNZNODESGET_1          101372001
+#define MON_ZCLIENT_RUNZNODESWATCHSET_1     101372101
+#define MON_ZCLIENT_RUNZNODESWATCHSET_2     101372102
+#define MON_ZCLIENT_ZNODECREATE_1           101372201
+#define MON_ZCLIENT_ZNODEDATAGET_1          101372301
+#define MON_ZCLIENT_ZNODEDATAGET_2          101372302
+#define MON_ZCLIENT_ZNODEDELETE_1           101372401
+#define MON_ZCLIENT_ZNODEDELETE_2           101372402
+#define MON_ZCLIENT_ZNODEWATCHSET_1         101372501
+#define MON_ZCLIENT_ZNODEWATCHSET_2         101372502
+#define MON_ZCLIENT_ZNODESTREECREATE_1      101372601
+#define MON_ZCLIENT_ZNODESTREECREATE_2      101372602
+#define MON_ZCLIENT_ZNODESTREECREATE_3      101372603
+#define MON_ZCLIENT_ZNODESTREECREATE_4      101372604
+#define MON_ZCLIENT_ZNODESTREECREATE_5      101372605
+#define MON_ZCLIENT_ZNODESTREECREATE_6      101372606
+#define MON_ZCLIENT_ZNODESTREECREATE_7      101372607
+#define MON_ZCLIENT_ZNODESTREECREATE_8      101372608
+#define MON_ZCLIENT_ERRORCHILDZNODESGET_1   101372701
+#define MON_ZCLIENT_CONFZNODEWATCHADD_1     101372801
+#define MON_ZCLIENT_CONFZNODEWATCHDELETE_1  101372901
+#define MON_ZCLIENT_CONFIGZNODESWATCHSET_1  101373001
+#define MON_ZCLIENT_ERRORZNODEWATCHADD_1    101373101
+#define MON_ZCLIENT_ERRORZNODEWATCHDELETE_1 101373201
+#define MON_ZCLIENT_ERRORZNODESWATCHSET_1   101373301
+#define MON_ZCLIENT_ZNODEWATCHRESET_1       101373401
+#define MON_ZCLIENT_ZNODEWATCHRESET_2       101373402
+#define MON_ZCLIENT_ERRORZNODEDELETE_1      101373501
+#define MON_ZCLIENT_ZNODEWATCHCHILDSET_1    101373601
+#define MON_ZCLIENT_ZNODEWATCHCHILDSET_2    101373602
+#define MON_ZCLIENT_HANDLECHILDZNODE_1      101373701
+#define MON_ZCLIENT_HANDLEERRORZNODES_1     101373801
+#define MON_ZCLIENT_HNDLEERRORCHILDZNODES_1 101373901
+#define MON_ZCLIENT_HANDLEERRORZNODE_1      101374001
+#define MON_ZCLIENT_HNDLERRCHLZNFORZNCHL_1  101374101
 
 /* Module: zconfig.cxx = 38 */
 #define ZCONFIG_ZCONFIG_1                   101380101
diff --git a/core/sqf/export/include/seabed/excep.h b/core/sqf/export/include/seabed/excep.h
index ed9e3f2..06a125b 100644
--- a/core/sqf/export/include/seabed/excep.h
+++ b/core/sqf/export/include/seabed/excep.h
@@ -32,9 +32,22 @@
 //
 // Use these for references.
 //
+#if __cplusplus < 201103L // Standards below C++2011 in which 
+                          // dynamic throw is allowed
+
 #define SB_THROW_FATAL(msg) throw SB_Fatal_Excep(msg)
 #define SB_THROWS_EXCEP(exc) throw (exc)
 #define SB_THROWS_FATAL SB_THROWS_EXCEP(SB_Fatal_Excep)
+ 
+#else  // Starting C++2011, specifiers use noexcept(false); SB_THROW_FATAL stays a throw statement
+
+#define SB_THROW_FATAL(msg) throw SB_Fatal_Excep(msg)
+#define SB_THROWS_EXCEP(exc) noexcept(false)
+#define SB_THROWS_FATAL noexcept(false)
+
+#endif
+
+
 
 //
 // Base-class for seabed exceptions.
diff --git a/core/sqf/export/include/seabed/fs.h b/core/sqf/export/include/seabed/fs.h
index ca0ae79..de2dbf1 100644
--- a/core/sqf/export/include/seabed/fs.h
+++ b/core/sqf/export/include/seabed/fs.h
@@ -293,7 +293,7 @@
 SB_Export void  file_mon_process_shutdown_now();
 SB_Export int   file_mon_process_startup(int sysmsgs)
 SB_THROWS_FATAL SB_DIAG_UNUSED;
-SB_Export int   file_mon_process_startup2(int sysmsgs, int pipeio)
+SB_Export int   file_mon_process_startup2(int sysmsgs, int pipeio, bool stderr_remap=true) // remap std_err to monitor
 SB_THROWS_FATAL SB_DIAG_UNUSED;
 
 SB_Export void  file_test_assert_disable(File_AS_Type *state);
diff --git a/core/sqf/export/include/seabed/ms.h b/core/sqf/export/include/seabed/ms.h
index d1c095b..8190925 100644
--- a/core/sqf/export/include/seabed/ms.h
+++ b/core/sqf/export/include/seabed/ms.h
@@ -367,6 +367,10 @@
     int last_pnid;                          // Last Physical Node ID returned
     bool _fill_1;                           
 } MS_Mon_Zone_Info;
+typedef struct MS_Mon_ClusterInstanceId {
+    int nid;                                // node id of requesting process
+    int pid;                                // process id of requesting process
+} MS_Mon_ClusterInstanceId;
 
 //
 // Note the MS_Mon_MSGTYPE, MS_Mon_REQTYPE, MS_Mon_PROCESSTYPE, and MS_Mon_Msg
@@ -381,7 +385,6 @@
     MS_MsgType_NodeDeleted,
     MS_MsgType_NodeDown,
     MS_MsgType_NodeJoining,
-    MS_MsgType_NodePrepare,
     MS_MsgType_NodeQuiesce,
     MS_MsgType_NodeUp,
     MS_MsgType_Open,
@@ -390,11 +393,7 @@
     MS_MsgType_ReintegrationError,
     MS_MsgType_Service,
     MS_MsgType_Shutdown,
-    MS_MsgType_SpareUp,
-    MS_MsgType_TmRestarted,
-    MS_MsgType_TmSyncAbort,
-    MS_MsgType_TmSyncCommit,
-    MS_MsgType_UnsolicitedMessage
+    MS_MsgType_SpareUp
 } MS_Mon_MSGTYPE;
 typedef enum {
     MS_ReqType_Close = 1,
@@ -403,6 +402,7 @@
     MS_ReqType_Event,
     MS_ReqType_Exit,
     MS_ReqType_Get,
+    MS_ReqType_InstanceId,
     MS_ReqType_Kill,
     MS_ReqType_MonStats,
     MS_ReqType_Mount,
@@ -432,11 +432,8 @@
     MS_ReqType_Shutdown,
     MS_ReqType_ShutdownNs,
     MS_ReqType_Startup,
-    MS_ReqType_Stfsd,
     MS_ReqType_TmLeader,
     MS_ReqType_TmReady,
-    MS_ReqType_TmSync,
-    MS_ReqType_TransInfo,
     MS_ReqType_ZoneInfo
 } MS_Mon_REQTYPE;
 typedef enum {
@@ -523,11 +520,6 @@
     char                 node_name[MS_MON_MAX_PROCESSOR_NAME];
     MS_MON_JOINING_PHASE phase;
 };
-struct MS_Mon_NodePrepare_def {
-    int  nid;
-    char node_name[MS_MON_MAX_PROCESSOR_NAME];
-    int  takeover;
-};
 struct MS_Mon_NodeQuiesce_def {
     int  nid;
     char node_name[MS_MON_MAX_PROCESSOR_NAME];
@@ -570,19 +562,6 @@
     int  pnid;
     char node_name[MS_MON_MAX_PROCESSOR_NAME];
 };
-struct MS_Mon_TmSyncNotice_def {
-    int                 nid[MS_MON_MAX_TM_SYNCS];
-    int                 orig_count;
-    int                 orig_tag[MS_MON_MAX_TM_SYNCS];
-    int                 orig_handle[MS_MON_MAX_TM_SYNCS];
-    int                 count;
-    int                 handle[MS_MON_MAX_TM_SYNCS];
-};
-struct MS_Mon_TmRestarted_def {
-    int  nid;
-    int  pnid;
-    char node_name[MS_MON_MAX_PROCESSOR_NAME];
-};
 // TODO: make less kludgy
 #if __WORDSIZE == 64
 enum { _MS_REQ_FILL = 3 };
@@ -604,14 +583,11 @@
         struct MS_Mon_NodeDown_def            down;
         struct MS_Mon_NodeJoining_def         joining;
         struct MS_Mon_Open_def                open;
-        struct MS_Mon_NodePrepare_def         prepare;
         struct MS_Mon_NewProcess_Notice_def   process_created;
         struct MS_Mon_NodeQuiesce_def         quiesce;
         struct MS_Mon_Shutdown_def            shutdown;
         struct MS_Mon_SpareUp_def             spare_up;
-        struct MS_Mon_TmSyncNotice_def        tmsync;
         struct MS_Mon_NodeUp_def              up;
-        struct MS_Mon_TmRestarted_def         tmrestarted;        
         struct MS_Mon_NodeAdded_def           added;
         struct MS_Mon_NodeChanged_def         changed;
         struct MS_Mon_NodeDeleted_def         deleted;
@@ -895,6 +871,13 @@
 SB_DIAG_UNUSED;
 
 //
+// Call this to get cluster id and instance id
+//
+SB_Export int msg_mon_get_instance_id(int *cluster_id,
+                                      int *instance_id)
+SB_DIAG_UNUSED;
+
+//
 // Call this to get monitor stats
 //
 SB_Export int msg_mon_get_monitor_stats(MS_Mon_Monitor_Stats_Type *stats)
@@ -1290,7 +1273,7 @@
 // sysmsgs: want system messages?
 // pipeio: want pipe io?
 //
-SB_Export int msg_mon_process_startup3(int sysmsgs, int pipeio)
+SB_Export int msg_mon_process_startup3(int sysmsgs, int pipeio, bool remap_stderr=true)
 SB_THROWS_FATAL SB_DIAG_UNUSED;
 
 //
diff --git a/core/sqf/export/include/trafconf/trafconfig.h b/core/sqf/export/include/trafconf/trafconfig.h
index 7888910..384980b 100644
--- a/core/sqf/export/include/trafconf/trafconfig.h
+++ b/core/sqf/export/include/trafconf/trafconfig.h
@@ -132,7 +132,8 @@
 {
     int  nid;                                   // Node Id (logical)
     int  pnid;                                  // Physical Node ID
-    char node_name[TC_PERSIST_PROCESSOR_NAME_MAX]; // hostname
+    char node_name[TC_PERSIST_PROCESSOR_NAME_MAX]; // short hostname
+    char domain_name[TC_PERSIST_PROCESSOR_NAME_MAX]; // domain name
     int  excluded_first_core;                   // First or only core assigned
     int  excluded_last_core;                    // Last core assigned or -1
     int  first_core;                            // First or only core assigned
@@ -144,7 +145,8 @@
 typedef struct TcPhysicalNodeConfiguration_s
 {
     int  pnid;                                  // Physical Node ID
-    char node_name[TC_PERSIST_PROCESSOR_NAME_MAX]; // hostname
+    char node_name[TC_PERSIST_PROCESSOR_NAME_MAX]; // short hostname
+    char domain_name[TC_PERSIST_PROCESSOR_NAME_MAX]; // domain name
     int  excluded_first_core;                   // First or only core assigned
     int  excluded_last_core;                    // Last core assigned or -1
     int  spare_count;                           // Number of entries in spare_pnid[]
diff --git a/core/sqf/mondump b/core/sqf/mondump
index fba5919..6a3beae 100755
--- a/core/sqf/mondump
+++ b/core/sqf/mondump
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
@@ -20,19 +20,19 @@
 #
 # @@@ END COPYRIGHT @@@
 #
-# mondump <pid> to [<node>:]<core> using <tmp-dir> if <node>
+# mondump <pid> to <core>
 #
 if [ $# -lt 2 ]; then
-	echo "usage: $0 <pid> <core> [<node> <tmp-dir>]"
-	exit 1
+    echo "usage: $0 <pid> <core>"
+    exit 1
 fi
 
 # change this to 1 for tracing to /tmp/mondump<pid>
 trace=0
 
 if [ $trace = 1 ]; then
-	TMP=/tmp/mondump$$
-	trap "rm -f ${TMP}; exit 1" 1 2 13 15
+    TMP=/tmp/mondump$$
+    trap "rm -f ${TMP}; exit 1" 1 2 13 15
 fi
 
 #
@@ -42,39 +42,25 @@
 core=$2
 tgtcore=$core
 if [ $trace = 1 ]; then
-	echo "input: pid=$pid" >> $TMP
-	echo "input: core=$core" >> $TMP
+    echo "input: pid=$pid" >> $TMP
+    echo "input: core=$core" >> $TMP
 fi
-shift 2
-node=""
-tmpdir=""
-if [ $# -ge 2 ]; then
-        node=$1
-        tmpdir=$2
-	if [ $trace = 1 ]; then
-		echo "input: node=$node" >> $TMP
-		echo "input: tmpdir=$tmpdir" >> $TMP
-	fi
-        shift 2
-fi
+
 dir=`dirname $core`
 core=`basename $core`
-if [ ! -z $node ]; then
-        dir=$tmpdir
-fi
 
 #
 # create core-file
 #
 if [ $trace = 1 ]; then
-	echo "" >> $TMP
-	echo "create core-file" >> $TMP
-	echo "cd $dir" >> $TMP
-	echo "gdb" >> $TMP
-	echo "  attach $pid" >> $TMP
-	echo "  gcore $core" >> $TMP
-	echo "  detach" >> $TMP
-	echo "" >> $TMP
+    echo "" >> $TMP
+    echo "create core-file" >> $TMP
+    echo "cd $dir" >> $TMP
+    echo "gdb" >> $TMP
+    echo "  attach $pid" >> $TMP
+    echo "  gcore $core" >> $TMP
+    echo "  detach" >> $TMP
+    echo "" >> $TMP
 fi
 
 cd $dir
@@ -85,32 +71,20 @@
 eof
 ret=$?
 if [ $trace = 1 ]; then
-	echo "gdb exit $ret" >> $TMP
+    echo "gdb exit $ret" >> $TMP
 fi
 
 if [ $ret = 0 ]; then
-        if [ -z $node ]; then
-		# make sure file created
-		if [ ! -e $dir/$core ]; then
-			ret=1
-			if [ $trace = 1 ]; then
-				echo "$dir/$core were not created" >> $TMP
-			fi
-		fi
-        else
-		if [ $trace = 1 ]; then
-			echo "pdcp -w $node $dir/$core $tgtcore" >> $TMP
-		fi
-		pdcp -w $node $dir/$core $tgtcore
-		ret=$?
-		if [ $trace = 1 ]; then
-			echo "pdcp exit $ret" >> $TMP
-		fi
-		rm -f $dir/$core
+    # make sure file created
+    if [ ! -e $dir/$core ]; then
+        ret=1
+        if [ $trace = 1 ]; then
+            echo "$dir/$core were not created" >> $TMP
         fi
+    fi
 fi
 
 if [ $trace = 1 ]; then
-	echo "exit $ret" >> $TMP
+    echo "exit $ret" >> $TMP
 fi
 exit $ret
diff --git a/core/sqf/monitor/linux/clio.cxx b/core/sqf/monitor/linux/clio.cxx
index d3034ea..92c8cc5 100644
--- a/core/sqf/monitor/linux/clio.cxx
+++ b/core/sqf/monitor/linux/clio.cxx
@@ -371,7 +371,6 @@
                                errno, strerror(errno));
         la_node_name[0] = '\0';
     }
-
     char *tmpptr = la_node_name;
     while ( *tmpptr )
     {
@@ -431,7 +430,7 @@
         sprintf(ip_port_fname,"%.*s/monitor.port.%d.%s",
                 (int)(sizeof(ip_port_fname)-(sizeof("/monitor.port..")+11
                                         +strlen(la_node_name))),
-                getenv("MPI_TMPDIR"),lv_MyNID,la_node_name);
+                getenv("TRAF_LOG"),lv_MyNID,la_node_name);
     } else 
     {
         // It's a real cluster
@@ -441,7 +440,7 @@
         sprintf(ip_port_fname,"%.*s/monitor.port.%s",
                 (int)(sizeof(ip_port_fname)-(sizeof("/monitor.port.")+11
                                         +strlen(la_short_node_name))),
-                getenv("MPI_TMPDIR"), la_short_node_name);
+                getenv("TRAF_LOG"), la_short_node_name);
 
     }
     // Assume nid zero if the global Seabed nid variable is not initialized
@@ -825,7 +824,9 @@
                             +(lv_idx*sizeof(SharedMsgDef)));
 
     // Bug catcher: shared buffer pid and verifier must match my process
-    LIOTM_assert((iv_pid ==-1 && iv_verifier ==-1) ||
+    //              unless it is a notice from monitor process
+    LIOTM_assert((lv_type == MC_NoticeReady) ||
+                 (iv_pid ==-1 && iv_verifier ==-1) ||
                  (iv_pid == lv_m->trailer.OSPid && iv_verifier == -1) ||
                  (iv_pid == lv_m->trailer.OSPid && 
                   iv_verifier == lv_m->trailer.verifier) ||
@@ -1066,27 +1067,30 @@
         else
         {
             iv_mpid = ((SharedMemHdr*)ip_cshm)->mPid;
-            LIOTM_assert(iv_mpid > 0);
-            if (cv_trace)
-                trace_where_printf(WHERE, "shared-memory=%p, monitor pid=%d, nid=%d\n"
-                                       , ip_cshm, iv_mpid, iv_nid);
-
-            iv_qid = msgget( lv_sharedSegKey, SQ_LIO_MSQ_PERMISSIONS );
-            if (iv_qid == -1) {
-                lv_errno = errno;
-                perror( "failed msgget()" );
+            if (iv_mpid > 0)
+            {
                 if (cv_trace)
-                    trace_where_printf(WHERE, "failed msgget() errno=%d(%s)\n", lv_errno, strerror(lv_errno));
-                // detach from shared memory
-                shmdt(ip_cshm);
-                errno = lv_errno;
-                ip_cshm = NULL;
-                iv_qid = iv_mpid = 0;
+                    trace_where_printf(WHERE, "shared-memory=%p, monitor pid=%d, nid=%d\n"
+                                           , ip_cshm, iv_mpid, iv_nid);
+    
+                iv_qid = msgget( lv_sharedSegKey, SQ_LIO_MSQ_PERMISSIONS );
+                if (iv_qid == -1) {
+                    lv_errno = errno;
+                    perror( "failed msgget()" );
+                    if (cv_trace)
+                        trace_where_printf(WHERE, "failed msgget() errno=%d(%s)\n", lv_errno, strerror(lv_errno));
+                    // detach from shared memory
+                    shmdt(ip_cshm);
+                    errno = lv_errno;
+                    ip_cshm = NULL;
+                    iv_qid = iv_mpid = 0;
+                }
+                else {
+                    ip_cshm_end = ip_cshm + lv_shsize;
+                    iv_initted = lv_ret = true;
+                }
             }
-            else {
-                ip_cshm_end = ip_cshm + lv_shsize;
-                iv_initted = lv_ret = true;
-            }
+            // else return false and let the caller handle retries
         }
     }
 
@@ -1186,15 +1190,11 @@
     case MsgType_NodeDown:
     case MsgType_NodeJoining:
     case MsgType_NodeQuiesce:
-    case MsgType_NodePrepare:
     case MsgType_NodeUp:
     case MsgType_SpareUp:
     case MsgType_ProcessDeath:
     case MsgType_ReintegrationError:
     case MsgType_Shutdown:
-    case MsgType_TmRestarted:
-    case MsgType_TmSyncAbort:
-    case MsgType_TmSyncCommit:
         if (cv_trace)
             trace_where_printf(WHERE,
                               "notice %d received\n",
@@ -1220,17 +1220,6 @@
         }
         break;
 
-    case MsgType_UnsolicitedMessage:
-        if (cv_trace)
-            trace_where_printf(WHERE,
-                               "Unsolicited msg, type=%d received\n",
-                               pp_msg->u.request.type);
-        if (ip_unsol_cb)
-            ip_unsol_cb(pp_msg, size_of_msg(pp_msg));
-        else
-            lv_ret = put_on_notice_list( pp_msg, size_of_msg(pp_msg));
-        break;
-
     default:
         if (cv_trace)
             trace_where_printf(WHERE,
@@ -1863,10 +1852,6 @@
         lv_len = lv_preamble + sizeof(pp_msg->u.request.u.joining);
         break;
 
-    case MsgType_NodePrepare:
-        lv_len = lv_preamble + sizeof(pp_msg->u.request.u.prepare);
-        break;
-
     case MsgType_NodeQuiesce:
         lv_len = lv_preamble + sizeof(pp_msg->u.request.u.quiesce);
         break;
@@ -1899,22 +1884,6 @@
         lv_len = lv_preamble + sizeof(pp_msg->u.request.u.spare_up);
         break;
 
-    case MsgType_TmRestarted:
-        lv_len = lv_preamble + sizeof(pp_msg->u.request.u.tm_restart);
-        break;
-
-    case MsgType_TmSyncAbort:
-    case MsgType_TmSyncCommit:
-        lv_len = lv_preamble + sizeof(pp_msg->u.request.u.tm_sync_notice);
-        break;
-
-    case MsgType_UnsolicitedMessage:
-        if (reply)
-            lv_len = lv_preamble + sizeof(pp_msg->u.reply.u.unsolicited_tm_sync);
-        else
-            lv_len = lv_preamble + sizeof(pp_msg->u.request.u.unsolicited_tm_sync);
-        break;
-
     case MsgType_Service:
         if (reply) {
 
@@ -1958,12 +1927,6 @@
             case ReplyType_Startup:
                 lv_len = lv_preamble + sizeof(pp_msg->u.reply.u.startup_info);
                 break;
-            case ReplyType_TmSync:
-                lv_len = lv_preamble + sizeof(pp_msg->u.reply.u.tm_sync);
-                break;
-            case ReplyType_TransInfo:
-                lv_len = lv_preamble + sizeof(pp_msg->u.reply.u.trans_info);
-                break;
             case ReplyType_ZoneInfo:
                 lv_len = lv_preamble + sizeof(pp_msg->u.reply.u.zone_info);
                 break;
@@ -2053,19 +2016,12 @@
             case ReqType_Startup:
                 lv_len = lv_preamble + sizeof(pp_msg->u.request.u.startup);
                 break;
-            //case ReqType_Stfsd:
             case ReqType_TmLeader:
                 lv_len = lv_preamble + sizeof(pp_msg->u.request.u.leader);
                 break;
             case ReqType_TmReady:
                 lv_len = lv_preamble + sizeof(pp_msg->u.request.u.tm_ready);
                 break;
-            case ReqType_TmSync:
-                lv_len = lv_preamble + sizeof(pp_msg->u.request.u.tm_sync);
-                break;
-            case ReqType_TransInfo:
-                lv_len = lv_preamble + sizeof(pp_msg->u.request.u.trans_info);
-                break;
             case ReqType_ZoneInfo:
                 lv_len = lv_preamble + sizeof(pp_msg->u.request.u.zone_info);
                 break;
@@ -2259,10 +2215,10 @@
     "Close",
     "Event",
     "NodeAdded",
+    "NodeChanged",
     "NodeDeleted",
     "NodeDown",
     "NodeJoining",
-    "NodePrepare",
     "NodeQuiesce",
     "NodeUp",
     "Open",
@@ -2272,24 +2228,27 @@
     "Service",
     "Shutdown",
     "SpareUp",
-    "TmRestarted",
-    "TmSyncAbort",
-    "TmSyncCommit",
-    "UnsolicitedMessage",
     "invalid"
 };
 
 const char * Local_IO_To_Monitor::reqTypes_[] = {
-    "",
+    "invalid",
     "Close",
+    "DelProcessNs",
     "Dump",
     "Event",
     "Exit",
     "Get",
+    "InstanceId",
     "Kill",
     "MonStats",
     "Mount",
+    "NameServerAdd",
+    "NameServerDelete",
+    "NameServerStart",
+    "NameServerStop",
     "NewProcess",
+    "NewProcessNs",
     "NodeAdd",
     "NodeDelete",
     "NodeDown",
@@ -2305,35 +2264,34 @@
     "PNodeInfo",
     "ProcessInfo",
     "ProcessInfoCont",
+    "ProcessInfoNs",
     "Set",
     "Shutdown",
+    "ShutdownNs",
     "Startup",
-    "Stfsd",
     "TmLeader",
     "TmReady",
-    "TmSync",
-    "TransInfo",
     "ZoneInfo",
     "invalid"
 };
 
 const char * Local_IO_To_Monitor::replyTypes_[] = {
     "Generic",
+    "DelProcessNs",
     "Dump",
     "Get",
     "MonStats",
     "Mount",
     "NewProcess",
+    "NewProcessNs",
     "NodeInfo",
     "NodeName",
     "Open",
     "OpenInfo",
     "PNodeInfo",
     "ProcessInfo",
-    "Stfsd",
+    "ProcessInfoNs",
     "Startup",
-    "TmSync",
-    "TransInfo",
     "ZoneInfo",
     "invalid"
 };
diff --git a/core/sqf/monitor/linux/cluster.cxx b/core/sqf/monitor/linux/cluster.cxx
index a83f10e..1dbcb65 100644
--- a/core/sqf/monitor/linux/cluster.cxx
+++ b/core/sqf/monitor/linux/cluster.cxx
@@ -135,10 +135,6 @@
 #endif
 const char *EpollEventString( __uint32_t events );
 const char *EpollOpString( int op );
-const char *NodePhaseString( NodePhase phase );
-#ifdef NAMESERVER_PROCESS
-#define MPI_Abort(a,b) abort()
-#endif
 
 const char *NodePhaseString( NodePhase phase )
 {
@@ -152,12 +148,6 @@
         case Phase_Activating:
             str = "Phase_Activating";
             break;
-        case Phase_SoftDown:
-            str = "Phase_SoftDown";
-            break;
-        case Phase_SoftUp:
-            str = "Phase_SoftUp";
-            break;
         default:
             str = "NodePhase - Undefined";
             break;
@@ -253,37 +243,9 @@
             {
                 spareNode->SetState( State_Up );
             }
-#ifndef NAMESERVER_PROCESS
-            if ( tmCount )
-            {
-                // Send node prepare notice to local DTM processes
-                lnode = spareNode->GetFirstLNode();
-                for ( ; lnode; lnode = lnode->GetNextP() )
-                {
-                    lnode->PrepareForTransactions( downNode->GetPNid() != spareNode->GetPNid() );
-                }
-            }
-#else
             ResetIntegratingPNid();
-#endif
         }
 
-#ifndef NAMESERVER_PROCESS
-        if ( downNode->GetPNid() != spareNode->GetPNid() )
-        {
-            // we need to abort any active TmSync
-            if (( MyNode->GetTmSyncState() == SyncState_Start    ) ||
-                ( MyNode->GetTmSyncState() == SyncState_Continue ) ||
-                ( MyNode->GetTmSyncState() == SyncState_Commit   )   )
-            {
-                MyNode->SetTmSyncState( SyncState_Abort );
-                Monitor->SetAbortPendingTmSync();
-                if (trace_settings & (TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
-                   trace_printf("%s@%d" " - Node "  "%d" " TmSyncState updated (" "%d" ")" "\n", method_name, __LINE__, MyPNID, MyNode->GetTmSyncState());
-            }
-        }
-#endif
-
         if (trace_settings & TRACE_INIT)
         {
             trace_printf( "%s@%d - Spare node activating! pnid=%d, name=(%s)\n"
@@ -327,6 +289,11 @@
     const char method_name[] = "CCluster::NodeTmReady";
     TRACE_ENTRY;
 
+    if ( ! MyNode->IsSpareNode() && MyNode->GetPhase() != Phase_Ready )
+    {
+        MyNode->CheckActivationPhase();
+    }
+
     if (trace_settings & TRACE_INIT)
     {
         trace_printf( "%s@%d - nid=%d\n", method_name, __LINE__, nid );
@@ -336,39 +303,27 @@
 
     if (trace_settings & (TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
     {
-        trace_printf( "%s@%d - TmReady, nid=%d, tm count=%d, soft node down=%d, LNodesCount=%d\n"
+        trace_printf( "%s@%d - TmReady, nid=%d, tm count=%d, LNodesCount=%d\n"
                     , method_name, __LINE__
                     , nid
                     , tmReadyCount_
-                    , MyNode->IsSoftNodeDown()
                     , MyNode->GetLNodesCount() );
     }
 
-    MyNode->StartPStartDPersistentDTM( nid );
+    if (IsRealCluster)
+    {
+        MyNode->StartPStartDPersistentDTM( nid );
+    }
 
     if ( MyNode->GetLNodesCount() == tmReadyCount_ )
     {
-        if ( MyNode->IsSoftNodeDown() )
-        {
-            MyNode->ResetSoftNodeDown();
+        char la_buf[MON_STRING_BUF_SIZE];
+        snprintf(la_buf, sizeof(la_buf), "[%s], Node activated! pnid=%d, name=(%s) \n", method_name, MyNode->GetPNid(), MyNode->GetName()); // bounded write, matches the snprintf usage elsewhere in this file
+        mon_log_write(MON_CLUSTER_NODE_TM_READY_2, SQ_LOG_INFO, la_buf);
 
-            MyNode->SetPhase( Phase_Ready );
-
-            char la_buf[MON_STRING_BUF_SIZE];
-            sprintf( la_buf, "[%s], Soft Node up! pnid=%d, name=(%s)\n"
-                   , method_name, MyNode->GetPNid(), MyNode->GetName());
-            mon_log_write(MON_CLUSTER_NODE_TM_READY_1, SQ_LOG_INFO, la_buf);
-        }
-        else
-        {
-            char la_buf[MON_STRING_BUF_SIZE];
-            sprintf(la_buf, "[%s], Node activated! pnid=%d, name=(%s) \n", method_name, MyNode->GetPNid(), MyNode->GetName());
-            mon_log_write(MON_CLUSTER_NODE_TM_READY_2, SQ_LOG_INFO, la_buf);
-
-            // Let other monitors know the node is up
-            CReplActivateSpare *repl = new CReplActivateSpare( MyPNID, -1 );
-            Replicator.addItem(repl);
-        }
+        // Let other monitors know the node is up
+        CReplActivateSpare *repl = new CReplActivateSpare( MyPNID, -1 );
+        Replicator.addItem(repl);
     }
 
     TRACE_EXIT;
@@ -382,8 +337,11 @@
 
     if (trace_settings & TRACE_INIT)
     {
-        trace_printf( "%s@%d - spare node %s pnid=%d\n"
-                    , method_name, __LINE__, spareNode->GetName(), spareNode->GetPNid() );
+        trace_printf( "%s@%d - spare node %s pnid=%d, state=%s\n"
+                    , method_name, __LINE__
+                    , spareNode->GetName()
+                    , spareNode->GetPNid()
+                    , StateString(spareNode->GetState()) );
     }
 
     assert( spareNode->GetState() == State_Up );
@@ -395,8 +353,9 @@
         lnode->Up();
     }
 
+    spareNode->SetPhase( Phase_Ready );
     spareNode->SetActivatingSpare( false );
-    ResetIntegratingPNid();
+    HealthCheck.triggerTimeToLogHealth();
 
     TRACE_EXIT;
 }
@@ -413,7 +372,7 @@
          strcat( IntegratingMonitorPort, ":");
          strcat( IntegratingMonitorPort, monitorPort);
 
-          if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
+          if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
           {
                trace_printf("%s@%d" " (MasterMonitor) UpdateMonitorPort Updating IntegratingMonitorPort to %s\n",
                              method_name, __LINE__,IntegratingMonitorPort );
@@ -445,16 +404,26 @@
 {
     const char method_name[] = "CCluster::AssignMonitorLeader";
     TRACE_ENTRY;
+     
+    if (!IsAgentMode || !ZClientEnabled)
+    {
+        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+        {
+            trace_printf( "%s@%d" " - (MasterMonitor) not in AgentMode or zookeeper not enabled, returning\n"
+                        , method_name, __LINE__);
+        }
+        TRACE_EXIT;
+        return;
+    }
 
-    int i = 0;
     int rc = 0;
-    
-    int monitorLeaderPNid = -1;
     CNode *node = NULL;
+    CNode *failedMasterNode = failedMaster ? Nodes->GetNode( (char *)failedMaster ) : NULL; // do not look up a NULL name; both NULL cases are rejected below
+    int failedMasterPNid = failedMasterNode ? failedMasterNode->GetPNid() : -1;
     
     if (failedMaster == NULL)
     {
-        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
+        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
         {
             trace_printf( "%s@%d" " - (MasterMonitor) failedMaster is NULL, returning\n" , method_name, __LINE__);
         }
@@ -462,103 +431,87 @@
         return;
     }
 
-    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
+    if (failedMasterNode == NULL)
     {
-        trace_printf( "%s@%d" " - (MasterMonitor) "  " MonitorLeader (%s) failed!\n"
-                    , method_name, __LINE__, failedMaster );
-    }
-
-    if (!IsAgentMode || !ZClientEnabled)
-    {
-        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
+        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
         {
-               trace_printf( "%s@%d" " - (MasterMonitor) not AgentMode or zookeeper not enabled, returning\n"
-                 , method_name, __LINE__);
+            trace_printf( "%s@%d" " - (MasterMonitor) failedMasterNode is NULL, returning\n" , method_name, __LINE__);
         }
         TRACE_EXIT;
         return;
     }
+
+    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+    {
+        trace_printf( "%s@%d" " - (MasterMonitor) MonitorLeader %s (pnid=%d) failed!\n"
+                    , method_name, __LINE__
+                    , failedMasterNode->GetName()
+                    , failedMasterPNid );
+    }
+
     // delete old master if needed
-    const char *masterMonitor = ZClient->WaitForAndReturnMaster (false);
+    const char *masterMonitor = ZClient->MasterWaitForAndReturn(false);
     if (masterMonitor)
     {   
-        // IFF it is the failed master, delete, do not delete anything else because we could delete a new master
+        // If it is the failed master, delete, do not delete anything else because we could delete a new master
         if (strcmp (masterMonitor, failedMaster) == 0)
         {
-            ZClient->WatchNodeMasterDelete (failedMaster);
-            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
+            ZClient->MasterZNodeDelete( failedMaster );
+            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
             {
                  trace_printf( "%s@%d" " - (MasterMonitor) deleting master %s\n"
                               , method_name, __LINE__, masterMonitor );
              }
         }
-        // no worries
-        else
-        {            
-             rc = ZClient->WatchMasterNode( masterMonitor ); 
-             UpdateMonitorPort ( masterMonitor );
-             if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
-             {
-                   trace_printf( "%s@%d" " - (MasterMonitor) master did not match, set watch (rc = %d) and returning %s\n"
-                     , method_name, __LINE__, rc, masterMonitor );
-             }
-             TRACE_EXIT;
-             return;
-         }
+        // else no worries
     }
 
-    // choose a new master
-    if (((MyNode) && ((MyNode->GetState() != State_Up) ||(!IAmIntegrated))) || (MyNode == NULL /* not set up yet*/))
+    if ((MyNode && 
+        (MyNode->IsPendingNodeDown()
+      || MyNode->GetState() != State_Up
+      || !IAmIntegrated)) )
     {
         // Do not let this monitor participate in choosing the master.  It can wait until an integrated
         // monitor makes a decision.
-         if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
-         {
-              trace_printf( "%s@%d" " - (MasterMonitor) This Node is not set up yet and will not participate in master choice!\n"
-                    , method_name, __LINE__);
-         }
-         
-         // wait until another monitor choose a master
-         const char *masterMonitor = ZClient->WaitForAndReturnMaster (true);
-         if (masterMonitor)
-         {
-             rc = ZClient->WatchMasterNode( masterMonitor ); 
-             if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
-             {
-                  trace_printf("%s@%d" " (MasterMonitor) AssignMonitorLeader WatchMasterNode with rc = %d\n", method_name, __LINE__, rc);
-             }
-
-          UpdateMonitorPort ( masterMonitor );
-          }
-          TRACE_EXIT;
-          return;
-    }
- 
-    // For all monitors who are up - choose the master using the same logic
-    for (i=0; i<GetConfigPNodesMax(); i++)
-    {
-        monitorLeaderPNid++; // set to -1, so this will bump it to 0 on the first time through
-
-        if (monitorLeaderPNid == GetConfigPNodesMax())
+        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
         {
-             if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
-             {
-                 trace_printf("%s@%d" " (MasterMonitor) AssignMonitorLeader  Unable to create or set watch\n", method_name, __LINE__);
-             }
-             char    buf[MON_STRING_BUF_SIZE];
-             snprintf( buf, sizeof(buf)
-                           , "[%s], Unable to create or set watch on master, hit max\n"
-                           , method_name );
-            mon_log_write(MON_CLUSTER_ASSIGNMONITORLEADER_1, SQ_LOG_ERR, buf);
-            break;
+             trace_printf( "%s@%d" " - (MasterMonitor) This Node %s is not up yet or going down and will not participate in master choice!\n"
+                   , method_name, __LINE__
+                   , MyNode->GetName() );
         }
-
-        if (Node[monitorLeaderPNid] == NULL)
+        
+        if (failedMasterPNid != MyPNID)
         {
+            // wait until another monitor chooses a master
+            const char *masterMonitor = ZClient->MasterWaitForAndReturn(true);
+            if (masterMonitor)
+            {
+                UpdateMonitorPort ( masterMonitor );
+            }
+        }
+        TRACE_EXIT;
+        return;
+    }
+
+    int masterPNid = -1;
+    // For all monitors who are up - choose the master using the same logic
+    for ( int i = 0; i < GetConfigPNodesCount(); i++)
+    {
+        masterPNid = indexToPnid_[i];
+
+        if (masterPNid == -1)
+        {
+            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+            {
+                trace_printf( "%s@%d - indexToPnid_[%d]=%d\n"
+                            , method_name, __LINE__
+                            , i
+                            , indexToPnid_[i] );
+            }
             continue;
         }
 
-        node = Node[monitorLeaderPNid];
+        node = Node[masterPNid];
 
         // skip this node
         if ( node == NULL )
@@ -566,78 +519,146 @@
             continue; 
         }
 
-        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
+        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
         {
-            trace_printf( "%s@%d - Node pnid=%d (%s), phase=%s, isSoftNodeDown=%d\n"
+            trace_printf( "%s@%d - Node pnid=%d (%s), state=%s, phase=%s, "
+                          "isPendingNodeDown=%d\n"
                         , method_name, __LINE__
                         , node->GetPNid()
                         , node->GetName()
+                        , StateString(node->GetState())
                         , NodePhaseString(node->GetPhase())
-                        , node->IsSoftNodeDown());
+                        , node->IsPendingNodeDown() );
         }
 
         if ( node->IsSpareNode() ||
-             node->IsSoftNodeDown() ||
+             node->IsPendingNodeDown() ||
              node->GetState() != State_Up ||
              node->GetPhase() != Phase_Ready )
         {
             continue; // skip this node for any of the above reasons
         }
 
-        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
+        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
         {
-            trace_printf("%s@%d" " - Node "  "%d" " is the new monitorLeaderPNid." "\n", method_name, __LINE__, node->GetPNid());
+            trace_printf( "%s@%d - Master Monitor candidate is %s, pnid=%d\n"
+                        , method_name, __LINE__
+                        , node->GetName(), node->GetPNid() );
         }
 
-        const char *masterMonitor = ZClient->WaitForAndReturnMaster (false);
+        const char *masterMonitor = ZClient->MasterWaitForAndReturn(false);
     
-        //nobody has written it yet, we don't want to overwrite anything
+        //nobody has written it yet
         if (!masterMonitor)
         {
-            rc = ZClient->CreateMasterZNode ( node->GetName() );
-            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
-            {
-                trace_printf("%s@%d" " (MasterMonitor) AssignMonitorLeader CreateMasterZNode with rc = %d\n", method_name, __LINE__, rc);
+            if (!masterMonitor && node->GetPNid() == MyPNID)
+            { // I'm the new master
+                rc = ZClient->MasterZNodeCreate( node->GetName() );  
+                if ( rc == ZOK )
+                {
+                    strcpy( MasterMonitorName, node->GetName() );
+        
+                    char    buf[MON_STRING_BUF_SIZE];
+                    snprintf( buf, sizeof(buf)
+                                      , "[%s], Master Monitor is %s on node %d\n"
+                                      , method_name, node->GetName(), node->GetPNid() );
+                    mon_log_write(MON_CLUSTER_ASSIGNMONITORLEADER_1, SQ_LOG_INFO, buf);
+                }
+                else
+                {
+                     char    buf[MON_STRING_BUF_SIZE];
+                     snprintf( buf, sizeof(buf)
+                               , "[%s], Unable to create or set watch on master node %s\n"
+                               , method_name, node->GetName() );
+                     mon_log_write(MON_CLUSTER_ASSIGNMONITORLEADER_2, SQ_LOG_ERR, buf);
+                }
             }
+            else
+            {
+                int retries = 0;
+                bool found = false;
+                while (!found && (retries < ZCLIENT_MASTER_ZNODE_RETRY_COUNT)) 
+                {
+                    // the current node candidate is not my node
+                    // so check for the current candidate to register as the master
+                    masterMonitor = ZClient->MasterWaitForAndReturn(false);
+                    if (!masterMonitor)
+                    { // no master registered
+                        if (node->GetState() == State_Down
+                         || node->IsPendingNodeDown() )
+                        {
+                            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+                            {
+                                trace_printf( "%s@%d (MasterMonitor) Current "
+                                              "candidate node %s, state=%s, pendingNodeDown=%d\n"
+                                            , method_name, __LINE__
+                                            , node->GetName()
+                                            , StateString(node->GetState())
+                                            , node->IsPendingNodeDown() );
+                            }
+                            break;
+                        }
+                        else
+                        {
+                            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+                            {
+                                trace_printf( "%s@%d (MasterMonitor) No masterMonitor registered, "
+                                              "candidate is %s, pnid=%d, state=%s, pendingNodeDown=%d\n"
+                                            , method_name, __LINE__
+                                            , node->GetName(), node->GetPNid()
+                                            , StateString(node->GetState())
+                                            , node->IsPendingNodeDown() );
+                            }
+                            usleep(1000000); // sleep for a second so as not to overwhelm the system
+                            retries++;
+                            continue;
+                        }
+                    }
+                    else
+                    {
+                        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+                        {
+                            trace_printf( "%s@%d (MasterMonitor) Registered masterMonitor=%s, "
+                                          "candidate node %s, pnid=%d\n"
+                                        , method_name, __LINE__
+                                        , masterMonitor
+                                        , node->GetName(), node->GetPNid() );
+                        }
+        
+                        strcpy( MasterMonitorName, masterMonitor );
+        
+                        char    buf[MON_STRING_BUF_SIZE];
+                        snprintf( buf, sizeof(buf)
+                                          , "[%s], Master Monitor is %s on node %d\n"
+                                          , method_name, node->GetName(), node->GetPNid() );
+                        mon_log_write(MON_CLUSTER_ASSIGNMONITORLEADER_3, SQ_LOG_INFO, buf);
+                        found = true;
+                    }
+                }
+                if (!masterMonitor)
+                {
+                    // the current node candidate is no longer a candidate
+                    continue;
+                }
+            }
+        }
+        else
+        {
+            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+            {
+                trace_printf( "%s@%d (MasterMonitor) MasterMonitorName=%s, masterMonitor=%s\n"
+                            , method_name, __LINE__
+                            , MasterMonitorName
+                            , masterMonitor );
+            }
+
+            strcpy( MasterMonitorName, masterMonitor );
+
             char    buf[MON_STRING_BUF_SIZE];
             snprintf( buf, sizeof(buf)
                               , "[%s], Master Monitor is %s on node %d\n"
                               , method_name, node->GetName(), node->GetPNid() );
-            mon_log_write(MON_CLUSTER_ASSIGNMONITORLEADER_2, SQ_LOG_INFO, buf);
-
-            if ( (rc == ZOK) || (rc == ZNODEEXISTS) )
-            {
-                 rc = ZClient->WatchMasterNode( node->GetName() ); 
-                 if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
-                 {
-                     trace_printf("%s@%d" " (MasterMonitor) AssignMonitorLeader WatchMasterNode with rc = %d\n", method_name, __LINE__, rc);
-                 }
-            }
-            else
-            {
-                 if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
-                 {
-                     trace_printf("%s@%d" " (MasterMonitor) AssignMonitorLeader  Unable to create or set watch\n", method_name, __LINE__);
-                 }
-                 char    buf[MON_STRING_BUF_SIZE];
-                 snprintf( buf, sizeof(buf)
-                           , "[%s], Unable to create or set watch on master node %s\n"
-                           , method_name, node->GetName() );
-                 mon_log_write(MON_CLUSTER_ASSIGNMONITORLEADER_3, SQ_LOG_ERR, buf);
-            }
-       }
-       else
-       {
-           rc = ZClient->WatchMasterNode( masterMonitor ); 
-           char    buf[MON_STRING_BUF_SIZE];
-           snprintf( buf, sizeof(buf)
-                          , "[%s], Master Monitor is %s\n"
-                          , method_name, masterMonitor);
-           mon_log_write(MON_CLUSTER_ASSIGNMONITORLEADER_4, SQ_LOG_INFO, buf);
-           if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
-           {
-               trace_printf("%s@%d" " (MasterMonitor) AssignMonitorLeader WatchMasterNode with rc = %d\n", method_name, __LINE__, rc);
-           }
+            mon_log_write(MON_CLUSTER_ASSIGNMONITORLEADER_4, SQ_LOG_INFO, buf);
         }
 
         break;
@@ -681,12 +702,11 @@
                 {
                     if (node)
                         trace_printf( "%s@%d - Node pnid=%d (%s), phase=%s, "
-                                      "isSoftNodeDown=%d, checkProcess=%d\n"
+                                      "checkProcess=%d\n"
                                     , method_name, __LINE__
                                     , node->GetPNid()
                                     , node->GetName()
                                     , NodePhaseString(node->GetPhase())
-                                    , node->IsSoftNodeDown()
                                     , checkProcess );
                 }
                 return;
@@ -709,12 +729,11 @@
                         {
                             if (node)
                                 trace_printf( "%s@%d - Node pnid=%d (%s), phase=%s, "
-                                              "isSoftNodeDown=%d, checkProcess=%d\n"
+                                              "checkProcess=%d\n"
                                             , method_name, __LINE__
                                             , node->GetPNid()
                                             , node->GetName()
                                             , NodePhaseString(node->GetPhase())
-                                            , node->IsSoftNodeDown()
                                             , checkProcess );
                         }
                         return;
@@ -727,13 +746,13 @@
             if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
             {
                 if (node)
-                    trace_printf( "%s@%d - Node pnid=%d (%s), phase=%s, "
-                                  "isSoftNodeDown=%d, checkProcess=%d\n"
+                    trace_printf( "%s@%d - Node pnid=%d (%s), state=%s, phase=%s, "
+                                  "checkProcess=%d\n"
                                 , method_name, __LINE__
                                 , node->GetPNid()
                                 , node->GetName()
+                                , StateString(node->GetState())
                                 , NodePhaseString(node->GetPhase())
-                                , node->IsSoftNodeDown()
                                 , checkProcess );
             }
             return;
@@ -744,17 +763,23 @@
 
     if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
     {
-        trace_printf( "%s@%d" " - Node "  "%d" " TmLeader failed! (checkProcess=%d)\n"
-                    , method_name, __LINE__, tmLeaderNid_, checkProcess );
+        trace_printf( "%s@%d - Node pnid=%d (%s), state=%s, TmLeader failed! "
+                      "(tmLeaderNid_=%d, checkProcess=%d)\n"
+                    , method_name, __LINE__
+                    , node->GetPNid()
+                    , node->GetName()
+                    , StateString(node->GetState())
+                    , tmLeaderNid_
+                    , checkProcess );
     }
 
-    for (i=0; i<GetConfigPNodesMax(); i++)
+    for ( int i = 0; i < GetConfigPNodesCount(); i++)
     {
-        TmLeaderPNid++;
+        TmLeaderPNid = indexToPnid_[i];
 
-        if (TmLeaderPNid == GetConfigPNodesMax())
+        if (TmLeaderPNid == indexToPnid_[GetConfigPNodesCount()])
         {
-            TmLeaderPNid = 0; // restart with nid 0
+            TmLeaderPNid = indexToPnid_[0]; // restart with first nid
         }
 
         if (TmLeaderPNid == pnid)
@@ -771,16 +796,15 @@
 
         if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
         {
-            trace_printf( "%s@%d - Node pnid=%d (%s), phase=%s, isSoftNodeDown=%d\n"
+            trace_printf( "%s@%d - Node pnid=%d (%s), state=%s, phase=%s\n"
                         , method_name, __LINE__
                         , node->GetPNid()
                         , node->GetName()
-                        , NodePhaseString(node->GetPhase())
-                        , node->IsSoftNodeDown());
+                        , StateString(node->GetState())
+                        , NodePhaseString(node->GetPhase()) );
         }
 
         if ( node->IsSpareNode() ||
-             node->IsSoftNodeDown() ||
              node->GetState() != State_Up ||
              node->GetPhase() != Phase_Ready )
         {
@@ -821,10 +845,10 @@
 #ifdef NAMESERVER_PROCESS
       ,mon2nsSock_(-1)
 #endif
-      ,epollFD_(-1),
+      ,epollFD_(-1)
+      ,epollPingFD_(-1),
       Node (NULL),
       LNode (NULL),
-      tmSyncPNid_ (-1),
       currentNodes_ (0),
       configPNodesCount_ (-1),
       configPNodesMax_ (-1),
@@ -855,8 +879,11 @@
       lowSeqNum_(0),
       highSeqNum_(0),
       reconnectSeqNum_(0),
-      seqNum_(1),
-      waitForWatchdogExit_(false)
+      seqNum_(1)
+      ,cumulativeSyncDelay_(0)
+      ,syncDelayLogEventInterval_(CCluster::SYNC_DELAY_LOGGING_FREQUENCY_DEFAULT)
+      ,syncDelayLogEventThreshold_(180)
+      ,waitForWatchdogExit_(false)
       ,waitForNameServerExit_(false)
       ,checkSeqNum_(false)
       ,validateNodeDown_(false)
@@ -983,6 +1010,37 @@
     // the maximum number that can be configured.
     recvBuffer_ = new struct sync_buffer_def[GetConfigPNodesMax()];
     recvBuffer2_ = new struct sync_buffer_def[GetConfigPNodesMax()];
+    memset( recvBuffer_, 0, sizeof(struct sync_buffer_def) * GetConfigPNodesMax() ); // zero every entry; avoids sizeof on a variable-length array type
+    memset( recvBuffer2_, 0, sizeof(struct sync_buffer_def) * GetConfigPNodesMax() ); // same element count as the allocation above
+
+    char *syncDelayLogEventIntervalC = getenv("SQ_MON_SYNC_DELAY_LOGGING_FREQUENCY");
+    if ( syncDelayLogEventIntervalC ) 
+    {
+        syncDelayLogEventInterval_ = atoi(syncDelayLogEventIntervalC);
+    }
+
+    int thresholdPercent = CCluster::SYNC_DELAY_LOGGING_THRESHOLD_MAX;
+    float threshold  = 0.5;
+    char *syncDelayLogEventThresholdC = getenv("SQ_MON_SYNC_DELAY_LOGGING_THRESHOLD");
+    if ( syncDelayLogEventThresholdC ) 
+    {
+        thresholdPercent = atoi(syncDelayLogEventThresholdC);
+        thresholdPercent = (thresholdPercent > 50) 
+            ? CCluster::SYNC_DELAY_LOGGING_THRESHOLD_MAX : thresholdPercent;
+        threshold  = (thresholdPercent/100.0);
+        syncDelayLogEventThreshold_ = (HealthCheck.getSyncTimeout() * threshold);
+    }
+
+    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+    {
+        trace_printf("%s@%d thresholdPercent=%d, threshold=%f, "
+                     "syncDelayLogEventThreshold_=%d, syncDelayLogEventInterval_=%d\n"
+                    , method_name, __LINE__
+                    , thresholdPercent
+                    , threshold
+                    , syncDelayLogEventThreshold_
+                    , syncDelayLogEventInterval_ );
+    }
 
     TRACE_EXIT;
 }
@@ -992,6 +1050,11 @@
     const char method_name[] = "CCluster::~CCluster";
     TRACE_ENTRY;
 
+    if (epollPingFD_ != -1)
+    {
+        close( epollPingFD_ );
+    }
+
     if (epollFD_ != -1)
     {
         close( epollFD_ );
@@ -1048,7 +1111,10 @@
     {
         if (trace_settings & TRACE_RECOVERY)
         {
-            trace_printf("%s@%d nodestate[%d].seq_num=%lld, seqNum=%lld\n", method_name, __LINE__, i, nodestate[indexToPnid_[i]].seq_num, seqNum );
+            trace_printf("%s@%d nodestate[%d].seq_num=%lld, seqNum=%lld\n"
+                        , method_name, __LINE__
+                        , indexToPnid_[i]
+                        , nodestate[indexToPnid_[i]].seq_num, seqNum );
         }
         if (nodestate[indexToPnid_[i]].seq_num > 1)
         {
@@ -1056,14 +1122,25 @@
             {
                 seqNum = nodestate[indexToPnid_[i]].seq_num;
             }
-            else
+            else if (nodestate[indexToPnid_[i]].seq_num != seqNum)
             {
-                assert(nodestate[indexToPnid_[i]].seq_num == seqNum);
+                char    buf[MON_STRING_BUF_SIZE];
+                snprintf( buf, sizeof(buf)
+                        , "[%s], Sync sequence number mismatch, expecting "
+                          "seqNum=%lld, my pnid=%d, nodestate[%d].seq_num=%lld\n"
+                        , method_name
+                        , seqNum, MyPNID, indexToPnid_[i]
+                        , nodestate[indexToPnid_[i]].seq_num);
+                mon_log_write(MON_CLUSTER_ENSUREANDGETSEQNUM_1, SQ_LOG_CRIT, buf);
+                mon_failure_exit();
             }
         }
         if (trace_settings & TRACE_RECOVERY)
         {
-            trace_printf("%s@%d nodestate[%d].seq_num=%lld, seqNum=%lld\n", method_name, __LINE__, i, nodestate[indexToPnid_[i]].seq_num, seqNum );
+            trace_printf("%s@%d nodestate[%d].seq_num=%lld, seqNum=%lld\n"
+                        , method_name, __LINE__
+                        , indexToPnid_[i]
+                        , nodestate[indexToPnid_[i]].seq_num, seqNum );
         }
     }
 
@@ -1136,7 +1213,7 @@
     {
         if( !IsRealCluster )
         {
-            snprintf(port_fname, sizeof(port_fname), "%s/monitor.%d.port.%s",getenv("MPI_TMPDIR"),pnid,node->GetName());
+            snprintf(port_fname, sizeof(port_fname), "%s/monitor.%d.port.%s",getenv("TRAF_LOG"),pnid,node->GetName());
         }
         else
         {
@@ -1156,7 +1233,7 @@
             {
                 strcpy (short_node_name, str1 );
             }
-            snprintf(port_fname, sizeof(port_fname), "%s/monitor.port.%s",getenv("MPI_TMPDIR"),short_node_name);
+            snprintf(port_fname, sizeof(port_fname), "%s/monitor.port.%s",getenv("TRAF_LOG"),short_node_name);
         }
         sprintf(temp_fname, "%s.bak", port_fname);
         remove(temp_fname);
@@ -1170,7 +1247,9 @@
                  node->GetName(), node->GetPNid());
         mon_log_write(MON_CLUSTER_MARKDOWN_2, SQ_LOG_CRIT, buf);
 
+        node->SetPendingNodeDown( true );
         node->SetKillingNode( true );
+        node->SetPhase( Phase_Undefined );
 
         if ( MyPNID == pnid &&
              (MyNode->GetState() == State_Up || MyNode->GetState() == State_Shutdown) &&
@@ -1189,6 +1268,10 @@
                     MyNode->setQuiesceState();
                     HealthCheck.setState(MON_NODE_QUIESCE);
                 }
+                if( IsRealCluster )
+                { // Terminate CommAccept thread, remote pings will fail
+                    CommAccept.shutdownWork();
+                }
                 break;
             default: // in all other states
                 if ( ! Emulate_Down )
@@ -1199,12 +1282,8 @@
                              "[CCluster::HardNodeDown], Node %s (%d)is down.\n",
                              node->GetName(), node->GetPNid());
                     mon_log_write(MON_CLUSTER_MARKDOWN_3, SQ_LOG_ERR, buf);
-                    // Don't generate a core file, abort is intentional
-                    struct rlimit limit;
-                    limit.rlim_cur = 0;
-                    limit.rlim_max = 0;
-                    setrlimit(RLIMIT_CORE, &limit);
-                    MPI_Abort(MPI_COMM_SELF,99);
+
+                    mon_failure_exit();
                 }
             }
         }
@@ -1218,6 +1297,7 @@
                 }
                 node->KillAllDown();
                 node->SetState( State_Down );
+                node->SetPendingNodeDown( false );
                 // Send node down message to local node's processes
                 lnode = node->GetFirstLNode();
                 for ( ; lnode; lnode = lnode->GetNextP() )
@@ -1226,24 +1306,12 @@
                 }
                 if ( ZClientEnabled )
                 {
-                    ZClient->WatchNodeDelete( node->GetName() );
-                    ZClient->WatchNodeMasterDelete( node->GetName() );
+                    ZClient->RunningZNodeDelete( node->GetName() );
+                    ZClient->MasterZNodeDelete( node->GetName() );
                 }
             }
         }
     }
-
-    // we need to abort any active TmSync
-    if (( MyNode->GetTmSyncState() == SyncState_Start    ) ||
-        ( MyNode->GetTmSyncState() == SyncState_Continue ) ||
-        ( MyNode->GetTmSyncState() == SyncState_Commit   )   )
-    {
-        MyNode->SetTmSyncState( SyncState_Abort );
-        Monitor->SetAbortPendingTmSync();
-        if (trace_settings & (TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
-           trace_printf("%s@%d - Node %s (pnid=%d) TmSyncState updated (%d)(%s)\n", method_name, __LINE__, MyNode->GetName(), MyPNID, MyNode->GetTmSyncState(), SyncStateString( MyNode->GetTmSyncState() ));
-    }
-
     if ( Emulate_Down )
     {
         AssignTmLeader(pnid, false);
@@ -1253,6 +1321,8 @@
         AssignLeaders(pnid, node->GetName(), false);
     }
 
+    HealthCheck.triggerTimeToLogHealth();
+
     TRACE_EXIT;
 }
 #endif
@@ -1306,8 +1376,7 @@
 
         if ( ZClientEnabled )
         {
-            //ZClient->WatchNodeDelete( node->GetName() );
-            ZClient->WatchNodeMasterDelete( node->GetName() );
+            ZClient->RunningZNodeDelete( node->GetName() );
         }
     }
 
@@ -1317,107 +1386,215 @@
 }
 #endif
 
-void CCluster::SoftNodeDown( int pnid )
+int CCluster::CheckSockPeer( int pnid, MPI_Status *stats, peer_t *peer )
 {
-    CNode  *node;
-    char    buf[MON_STRING_BUF_SIZE];
-
-    const char method_name[] = "CCluster::SoftNodeDown";
+    const char method_name[] = "CCluster::CheckSockPeer";
     TRACE_ENTRY;
 
-    node = Nodes->GetNode(pnid);
+    int err = MPI_SUCCESS;
+    CNode *node;
 
-    if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY))
-    {
-        trace_printf( "%s@%d - pnid=%d, state=%s, phase=%s, isInQuiesceState=%d, isSoftNodeDown=%d"
-                      " (local pnid=%d, state=%s, phase=%s, isInQuiesceState=%d, isSoftNodeDown=%d "
-                      "shutdown level=%d)\n"
-                    , method_name, __LINE__
-                    , pnid, StateString(node->GetState())
-                    , NodePhaseString(node->GetPhase())
-                    , node->isInQuiesceState()
-                    , node->IsSoftNodeDown()
-                    , MyPNID, StateString(MyNode->GetState())
-                    , NodePhaseString(MyNode->GetPhase())
-                    , MyNode->isInQuiesceState()
-                    , MyNode->IsSoftNodeDown()
-                    , MyNode->GetShutdownLevel() );
+    if( !IsRealCluster )
+    { // In virtual cluster, just return success
+        TRACE_EXIT;
+        return( err );
     }
 
-    if (( MyPNID == pnid              ) &&
-        ( MyNode->GetState() == State_Down ||
-          MyNode->IsKillingNode() ) )
+    // Release the sync lock temporarily to allow request worker thread to
+    // process any request that needs the sync lock.
+    Monitor->ExitSyncCycle();
+    pthread_yield();
+
+    node = Nodes->GetNode( pnid );
+    if (node)
     {
-        // we are coming down ... don't process it
-        return;
-    }
-
-    snprintf( buf, sizeof(buf)
-            , "[%s], Node %s (%d) is going soft down.\n"
-            , method_name, node->GetName(), node->GetPNid());
-    mon_log_write(MON_CLUSTER_SOFTNODEDOWN_1, SQ_LOG_ERR, buf);
-
-    node->SetKillingNode( true );
-
-    if ( node->GetState() == State_Up )
-    {
-        node->SetSoftNodeDown();            // Set soft down flag
-        node->SetPhase( Phase_SoftDown );   // Suspend TMSync on node
-
-        if ( node->GetPNid() == MyPNID )
+        if (node->GetState() != State_Up)
         {
-            // and tell remote monitor processes the node is soft down
-            CReplSoftNodeDown *repl = new CReplSoftNodeDown( MyPNID );
-            Replicator.addItem(repl);
+            if (socks_[pnid] != -1)
+            { // Peer socket is still active
+                if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+                {
+                    trace_printf( "%s@%d - Node %s (%d) is not up, "
+                                  "removing old socket from epoll set, "
+                                  "socks_[%d]=%d\n"
+                                , method_name, __LINE__
+                                , node->GetName(), node->GetPNid()
+                                , pnid, socks_[pnid] );
+                }
+                stats[pnid].MPI_ERROR = MPI_ERR_EXITED;
+                stats[pnid].count = 0;
+                err = MPI_ERR_IN_STATUS;
+                if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+                {
+                    trace_printf( "%s@%d - Setting Node %s (%d) status to "
+                                  "stats[%d].MPI_ERROR=%s\n"
+                                , method_name, __LINE__
+                                , node->GetName(), node->GetPNid()
+                                , pnid
+                                , ErrorMsg(stats[pnid].MPI_ERROR) );
+                }
+
+                --currentNodes_;
+                // Clear bit in set of "up nodes"
+                upNodes_.upNodes[pnid/MAX_NODE_BITMASK] &= ~(1ull << (pnid%MAX_NODE_BITMASK));
+    
+                // Remove old socket from epoll set, it may not be there
+                struct epoll_event event;
+                event.data.fd = socks_[pnid];
+                event.events = 0;
+                EpollCtlDelete( epollFD_, socks_[pnid], &event );
+                shutdown( socks_[pnid], SHUT_RDWR);
+                close( socks_[pnid] );
+                socks_[pnid] = -1;
+            }
         }
-
-#ifndef NAMESERVER_PROCESS
-        node->KillAllDownSoft();            // Kill all processes
-#endif
-
-        snprintf( buf, sizeof(buf)
-                , "[%s], Node %s (%d) executed soft down.\n"
-                , method_name, node->GetName(), node->GetPNid() );
-        mon_log_write(MON_CLUSTER_SOFTNODEDOWN_2, SQ_LOG_ERR, buf);
-    }
-    else
-    {
-        snprintf( buf, sizeof(buf),
-                  "[%s], Node %s (%d) soft node down not executed, state=%s\n"
-                , method_name, node->GetName()
-                , node->GetPNid()
-                , StateString(MyNode->GetState()) );
-        mon_log_write(MON_CLUSTER_SOFTNODEDOWN_3, SQ_LOG_ERR, buf);
-        // Probably a programmer bonehead!
-        abort();
+        else if ( pnid > MyPNID )
+        { // peer is above my node, so connect to peer
+            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+            {
+                trace_printf( "%s@%d - Pinging node %s (%d) to see if it's up\n"
+                            , method_name, __LINE__
+                            , node->GetName(), node->GetPNid() );
+            }
+            if (PingSockPeer( node, peer->znodeFailedTime ))
+            {
+                if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+                {
+                    trace_printf( "%s@%d - Node %s (%d) is available\n"
+                                , method_name, __LINE__
+                                , node->GetName(), node->GetPNid() );
+                }
+            }
+            else
+            {
+                if (node->GetState() != State_Up)
+                {
+                    if (socks_[pnid] != -1)
+                    {
+                        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+                        {
+                            trace_printf( "%s@%d - Node %s (%d) is not up, "
+                                          "removing old socket from epoll set, "
+                                          "socks_[%d]=%d\n"
+                                        , method_name, __LINE__
+                                        , node->GetName(), node->GetPNid()
+                                        , pnid, socks_[pnid] );
+                        }
+    
+                        --currentNodes_;
+                        // Clear bit in set of "up nodes"
+                        upNodes_.upNodes[pnid/MAX_NODE_BITMASK] &= ~(1ull << (pnid%MAX_NODE_BITMASK));
+            
+                        // Remove old socket from epoll set, it may not be there
+                        struct epoll_event event;
+                        event.data.fd = socks_[pnid];
+                        event.events = 0;
+                        EpollCtlDelete( epollFD_, socks_[pnid], &event );
+                        shutdown( socks_[pnid], SHUT_RDWR);
+                        close( socks_[pnid] );
+                        socks_[pnid] = -1;
+                    }
+                    stats[pnid].MPI_ERROR = MPI_ERR_EXITED;
+                    stats[pnid].count = 0;
+                    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+                    {
+                        trace_printf( "%s@%d - Setting Node %s (%d) status to "
+                                      "stats[%d].MPI_ERROR=%s\n"
+                                    , method_name, __LINE__
+                                    , node->GetName(), node->GetPNid()
+                                    , pnid
+                                    , ErrorMsg(stats[pnid].MPI_ERROR) );
+                    }
+                }
+                err = MPI_ERR_IN_STATUS;
+            }
+        }
+        else
+        { // peer is below my node, accept connection from peer
+            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+            {
+                trace_printf( "%s@%d - Pinging node %s (%d) to see if it's up\n"
+                            , method_name, __LINE__
+                            , node->GetName(), node->GetPNid() );
+            }
+            if (PingSockPeer( node, peer->znodeFailedTime ))
+            {
+                if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+                {
+                    trace_printf( "%s@%d - Node %s (%d) is available\n"
+                                , method_name, __LINE__
+                                , node->GetName(), node->GetPNid() );
+                }
+            }
+            else
+            {
+                if (node->GetState() != State_Up)
+                {
+                    if (socks_[pnid] != -1)
+                    {
+                        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+                        {
+                            trace_printf( "%s@%d - Node %s (%d) is not up, "
+                                          "removing old socket from epoll set, "
+                                          "socks_[%d]=%d\n"
+                                        , method_name, __LINE__
+                                        , node->GetName(), node->GetPNid()
+                                        , pnid, socks_[pnid] );
+                        }
+    
+                        --currentNodes_;
+                        // Clear bit in set of "up nodes"
+                        upNodes_.upNodes[pnid/MAX_NODE_BITMASK] &= ~(1ull << (pnid%MAX_NODE_BITMASK));
+            
+                        // Remove old socket from epoll set, it may not be there
+                        struct epoll_event event;
+                        event.data.fd = socks_[pnid];
+                        event.events = 0;
+                        EpollCtlDelete( epollFD_, socks_[pnid], &event );
+                        shutdown( socks_[pnid], SHUT_RDWR);
+                        close( socks_[pnid] );
+                        socks_[pnid] = -1;
+                    }
+                    stats[pnid].MPI_ERROR = MPI_ERR_EXITED;
+                    stats[pnid].count = 0;
+                    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+                    {
+                        trace_printf( "%s@%d - Setting Node %s (%d) status to "
+                                      "stats[%d].MPI_ERROR=%s\n"
+                                    , method_name, __LINE__
+                                    , node->GetName(), node->GetPNid()
+                                    , pnid
+                                    , ErrorMsg(stats[pnid].MPI_ERROR) );
+                    }
+                }
+                err = MPI_ERR_IN_STATUS;
+            }
+        }
+    
+        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+        {
+            for ( int i = 0; i < GetConfigPNodesCount(); i++ )
+            {
+                if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+                {
+                    trace_printf( "%s@%d" " - socks_[%d]=%d, "
+                                  "stats[%d].MPI_ERROR=%s\n"
+                                , method_name, __LINE__
+                                , indexToPnid_[i]
+                                , socks_[indexToPnid_[i]]
+                                , indexToPnid_[i]
+                                , ErrorMsg(stats[indexToPnid_[i]].MPI_ERROR) );
+                }
+            }
+            trace_printf( "%s@%d - Returning err=%d(%s)\n"
+                        , method_name, __LINE__, err, ErrorMsg(err) );
+        }
     }
 
-#ifndef NAMESERVER_PROCESS
-    // we need to abort any active TmSync
-    if (( MyNode->GetTmSyncState() == SyncState_Start    ) ||
-        ( MyNode->GetTmSyncState() == SyncState_Continue ) ||
-        ( MyNode->GetTmSyncState() == SyncState_Commit   )   )
-    {
-        MyNode->SetTmSyncState( SyncState_Abort );
-        Monitor->SetAbortPendingTmSync();
-        if (trace_settings & (TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
-           trace_printf("%s@%d - Node %s (pnid=%d) TmSyncState updated (%d)(%s)\n", method_name, __LINE__, MyNode->GetName(), MyPNID, MyNode->GetTmSyncState(), SyncStateString( MyNode->GetTmSyncState() ));
-    }
-#endif
-
-    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
-    {
-        trace_printf( "%s@%d - Node pnid=%d (%s), phase=%s, isSoftNodeDown=%d\n"
-                    , method_name, __LINE__
-                    , node->GetPNid()
-                    , node->GetName()
-                    , NodePhaseString(node->GetPhase())
-                    , node->IsSoftNodeDown());
-    }
-
-    AssignLeaders(pnid, node->GetName(), false);
+    Monitor->EnterSyncCycle();
 
     TRACE_EXIT;
+    return( err );
 }
 
 bool CCluster::CheckSpareSet( int pnid )
@@ -1743,18 +1920,7 @@
                 }
                 else
                 {
-                    if ( tmCount )
-                    {
-#ifndef NAMESERVER_PROCESS
-                        // Send node prepare notice to local DTM processes
-                        lnode = node->GetFirstLNode();
-                        for ( ; lnode; lnode = lnode->GetNextP() )
-                        {
-                            lnode->PrepareForTransactions( true );
-                        }
-#endif
-                    }
-                    else
+                    if ( tmCount == 0 )
                     {
                         // Process logical node up
                         lnode = node->GetFirstLNode();
@@ -1818,7 +1984,7 @@
             {
                 if ( ZClientEnabled )
                 {
-                    rc = ZClient->WatchNode( node->GetName() );
+                    rc = ZClient->RunningZNodeWatchAdd( node->GetName() );
                     if ( rc != ZOK )
                     {
                         char    buf[MON_STRING_BUF_SIZE];
@@ -1994,107 +2160,6 @@
 }
 #endif
 
-int CCluster::SoftNodeUpPrepare( int pnid )
-{
-    char    buf[MON_STRING_BUF_SIZE];
-    int     rc = MPI_SUCCESS;
-    int     tmCount = 0;
-    CNode  *node;
-    CLNode *lnode;
-    STATE   nodeState;
-
-    const char method_name[] = "CCluster::SoftNodeUpPrepare";
-    TRACE_ENTRY;
-
-    node = Nodes->GetNode( pnid );
-    if ( node == NULL )
-    {
-        if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY))
-           trace_printf( "%s@%d - Invalid node, pnid=%d\n"
-                       , method_name, __LINE__, pnid );
-
-        return( MPI_ERR_NAME );
-    }
-
-    nodeState = node->GetState();
-
-    if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY))
-       trace_printf( "%s@%d - Node name=%s, pnid=%d, state=%s, soft down=%d\n"
-                   , method_name, __LINE__
-                   , node->GetName()
-                   , node->GetPNid()
-                   , StateString( nodeState )
-                   , node->IsSoftNodeDown() );
-
-    if ( nodeState != State_Up )
-    {
-        if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY))
-            trace_printf( "%s@%d - Unexpectedly executing SoftNodeUp\n",
-                          method_name, __LINE__ );
-        // Programmer bonehead!
-        abort();
-    }
-
-    node->SetKillingNode( false );
-
-    node->ResetSoftNodeDown( );
-
-    node->SetPhase( Phase_Ready );
-
-    if ( MyPNID == pnid )
-    {
-        SMSIntegrating = true;
-#ifndef NAMESERVER_PROCESS
-        node->SetSoftNodeUp( );
-        Monitor->StartPrimitiveProcesses();
-#endif
-        // Let other monitors know this node is preparing to soft up
-        CReplSoftNodeUp *repl = new CReplSoftNodeUp(MyPNID);
-        Replicator.addItem(repl);
-    }
-    else
-    {
-        // Any DTMs running?
-        for ( int i=0; !tmCount && i < Nodes->GetPNodesCount(); i++ )
-        {
-            CNode  *tempNode = Nodes->GetNodeByMap( i );
-            lnode = tempNode->GetFirstLNode();
-            for ( ; lnode; lnode = lnode->GetNextP() )
-            {
-                CProcess *process = lnode->GetProcessLByType( ProcessType_DTM );
-                if ( process  ) tmCount++;
-            }
-        }
-        if ( tmCount )
-        {
-#ifndef NAMESERVER_PROCESS
-            // Send DTM restarted notice to local DTM processes
-            lnode = node->GetFirstLNode();
-            for ( ; lnode; lnode = lnode->GetNextP() )
-            {
-                lnode->SendDTMRestarted();
-            }
-#endif
-        }
-        else
-        {
-            snprintf( buf, sizeof(buf),
-                      "[%s], Node %s (%d) soft node up prepare not executed, state=%s, tmCount=%d\n"
-                    , method_name, node->GetName()
-                    , node->GetPNid()
-                    , StateString(MyNode->GetState())
-                    , tmCount );
-            mon_log_write(MON_CLUSTER_SOFTNODEUP_1, SQ_LOG_WARNING, buf);
-        }
-    }
-
-    TRACE_EXIT;
-    return( rc );
-}
-
-
-
-
 
 const char *StateString( STATE state)
 {
@@ -2174,51 +2239,6 @@
 
 
 #ifndef NAMESERVER_PROCESS
-void CCluster::AddTmsyncMsg( struct sync_buffer_def *tmSyncBuffer
-                           , struct sync_def *sync
-                           , struct internal_msg_def *msg)
-{
-    const char method_name[] = "CCluster::AddTmsyncMsg";
-    TRACE_ENTRY;
-
-    if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-        trace_printf("%s@%d - Requesting SyncType=%d\n", method_name,
-                     __LINE__, sync->type);
-
-    msg->type = InternalType_Sync;
-    msg->u.sync.type = sync->type;
-    msg->u.sync.pnid = sync->pnid;
-    msg->u.sync.syncnid = sync->syncnid;
-    msg->u.sync.tmleader = sync->tmleader;
-    msg->u.sync.state = sync->state;
-    msg->u.sync.count = sync->count;
-    if ( sync->type == SyncType_TmData )
-    {
-        memmove (msg->u.sync.data, sync->data, sync->length);
-    }
-    msg->u.sync.length = sync->length;
-
-    // We can have only have a single "InternalType_Sync" msg in our
-    // SyncBuffer, else we cause a collision.
-
-    int msgSize = (MSG_HDR_SIZE + sizeof(sync_def) - MAX_SYNC_DATA
-                   + sync->length );
-
-    // Insert the message size into the message header
-    msg->replSize = msgSize;
-    tmSyncBuffer->msgInfo.msg_count = 1;
-    tmSyncBuffer->msgInfo.msg_offset += msgSize;
-
-    // Set end-of-buffer marker
-    msg = (struct internal_msg_def *)
-        &tmSyncBuffer->msg[tmSyncBuffer->msgInfo.msg_offset];
-    msg->type = InternalType_Null;
-
-    TRACE_EXIT;
-}
-#endif
-
-#ifndef NAMESERVER_PROCESS
 void CCluster::DoDeviceReq(char * ldevName)
 {
     const char method_name[] = "CCluster::DoDeviceReq";
@@ -2351,7 +2371,7 @@
     {
     case InternalType_Null:
         if (trace_settings & TRACE_SYNC_DETAIL)
-            trace_printf("%s@%d - Node n%d has nothing to "
+            trace_printf("%s@%d - Physical Node pnid=n%d has nothing to "
                          "update. \n", method_name, __LINE__, pnid);
         break;
 
@@ -2396,6 +2416,7 @@
                                            , recv_msg->u.nameserver_delete.node_name );
         break;
 
+#ifndef NAMESERVER_PROCESS
     case InternalType_NodeAdd:
         if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
             trace_printf( "%s@%d - Internal node add request for node_name=%s, "
@@ -2418,6 +2439,7 @@
                                   , recv_msg->u.node_add.processors
                                   , recv_msg->u.node_add.roles );
         break;
+#endif
 
     case InternalType_Clone:
         if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
@@ -2443,6 +2465,7 @@
         ReqQueue.enqueueShutdownReq( recv_msg->u.shutdown.level );
         break;
 
+#ifndef NAMESERVER_PROCESS
     case InternalType_NodeDelete:
         if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
             trace_printf( "%s@%d - Internal node delete request for pnid=%d\n"
@@ -2454,6 +2477,7 @@
                                      , recv_msg->u.node_delete.req_verifier
                                      , recv_msg->u.node_delete.pnid );
         break;
+#endif
 
     case InternalType_Down:
         if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
@@ -2475,22 +2499,6 @@
                                    , recv_msg->u.node_name.new_name );
         break;
 
-    case InternalType_SoftNodeDown:
-        if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
-            trace_printf("%s@%d - Internal soft node down request for pnid=%d\n", method_name, __LINE__, recv_msg->u.down.pnid);
-
-        // Queue the node down request for processing by a worker thread.
-        ReqQueue.enqueueSoftNodeDownReq( recv_msg->u.down.pnid );
-        break;
-
-    case InternalType_SoftNodeUp:
-        if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
-            trace_printf("%s@%d - Internal soft node up request for pnid=%d\n", method_name, __LINE__, recv_msg->u.up.pnid);
-
-        // Queue the node up request for processing by a worker thread.
-        ReqQueue.enqueueSoftNodeUpReq( recv_msg->u.up.pnid );
-        break;
-
     case InternalType_Up:
         if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
             trace_printf("%s@%d - Internal up node request for pnid=%d\n", method_name, __LINE__, recv_msg->u.up.pnid);
@@ -2505,41 +2513,8 @@
             trace_printf("%s@%d - Internal dump request for nid=%d, pid=%d\n",
                          method_name, __LINE__,
                          recv_msg->u.dump.nid, recv_msg->u.dump.pid);
-        lnode = Nodes->GetLNode( recv_msg->u.dump.nid );
-        if ( lnode )
-        {
-            process = lnode->GetProcessL(recv_msg->u.dump.pid);
-
-            if (process)
-            {
-                int verifier = recv_msg->u.dump.verifier;
-                if ( (verifier == -1) || (verifier == process->GetVerifier()) )
-                {
-                    process->DumpBegin(recv_msg->u.dump.dumper_nid,
-                                       recv_msg->u.dump.dumper_pid,
-                                       recv_msg->u.dump.dumper_verifier,
-                                       recv_msg->u.dump.core_file);
-                }
-                else
-                {
-                    char buf[MON_STRING_BUF_SIZE];
-                    snprintf(buf, sizeof(buf), "[%s], Can't find process nid=%d, "
-                             "pid=%d, verifier=%d for dump target.\n", method_name,
-                             recv_msg->u.dump.nid, recv_msg->u.dump.pid,
-                             recv_msg->u.dump.verifier);
-                    mon_log_write(MON_CLUSTER_HANDLEOTHERNODE_1, SQ_LOG_ERR, buf);
-                }
-            }
-            else
-            {
-                char buf[MON_STRING_BUF_SIZE];
-                snprintf(buf, sizeof(buf), "[%s], Can't find process nid=%d, "
-                         "pid=%d for dump target.\n", method_name,
-                         recv_msg->u.dump.nid, recv_msg->u.dump.pid);
-                mon_log_write(MON_CLUSTER_HANDLEOTHERNODE_2, SQ_LOG_ERR, buf);
-            }
-        }
-
+        // Queue the dump request for processing by a worker thread.
+        ReqQueue.enqueueDumpReq( &recv_msg->u.dump );
         break;
 
     case InternalType_DumpComplete:
@@ -2547,38 +2522,8 @@
             trace_printf("%s@%d - Internal dump-complete request for nid=%d, pid=%d\n",
                          method_name, __LINE__,
                          recv_msg->u.dump.nid, recv_msg->u.dump.pid);
-        lnode = Nodes->GetLNode( recv_msg->u.dump.nid );
-        if ( lnode )
-        {
-            process = lnode->GetProcessL(recv_msg->u.dump.pid);
-
-            if (process)
-            {
-                int verifier = recv_msg->u.dump.verifier;
-                if ( (verifier == -1) || (verifier == process->GetVerifier()) )
-                {
-                    process->DumpEnd(recv_msg->u.dump.status, recv_msg->u.dump.core_file);
-                }
-                else
-                {
-                    char buf[MON_STRING_BUF_SIZE];
-                    snprintf(buf, sizeof(buf), "[%s], Can't find process nid=%d, "
-                             "pid=%d, verifier=%d for dump target.\n", method_name,
-                             recv_msg->u.dump.nid, recv_msg->u.dump.pid,
-                             recv_msg->u.dump.verifier);
-                    mon_log_write(MON_CLUSTER_HANDLEOTHERNODE_3, SQ_LOG_ERR, buf);
-                }
-            }
-            else
-            {
-                // Dump completion handled in CProcess::Exit()
-                char buf[MON_STRING_BUF_SIZE];
-                snprintf(buf, sizeof(buf), "[%s], Can't find process nid=%d, "
-                         "pid=%d for dump complete target.\n", method_name,
-                         recv_msg->u.dump.nid, recv_msg->u.dump.pid);
-                mon_log_write(MON_CLUSTER_HANDLEOTHERNODE_4, SQ_LOG_ERR, buf);
-            }
-        }
+        // Queue the dump complete request for processing by a worker thread.
+        ReqQueue.enqueueDumpCompleteReq( &recv_msg->u.dump );
         break;
 #endif
 
@@ -2596,47 +2541,8 @@
     case InternalType_Event:
         if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
             trace_printf("%s@%d - Internal event request\n", method_name, __LINE__);
-        if ( MyNode->IsMyNode(recv_msg->u.event.nid) )
-        {
-            if (trace_settings & TRACE_SYNC)
-                trace_printf("%s@%d - processing event for (%d, %d)\n", method_name, __LINE__, recv_msg->u.event.nid, recv_msg->u.event.pid);
-
-            lnode = Nodes->GetLNode( recv_msg->u.event.nid );
-            if ( lnode )
-            {
-                process = lnode->GetProcessL(recv_msg->u.event.pid);
-
-                if (process)
-                {
-                    int verifier = recv_msg->u.dump.verifier;
-                    if ( (verifier == -1) || (verifier == process->GetVerifier()) )
-                    {
-                        process->GenerateEvent (recv_msg->u.event.event_id,
-                                                recv_msg->u.event.length,
-                                                &recv_msg->u.event.data);
-                    }
-                    else
-                    {
-                        char buf[MON_STRING_BUF_SIZE];
-                        snprintf(buf, sizeof(buf), "[%s], Can't find process nid=%d, "
-                                 "pid=%d, verifier=%d for event=%d\n", method_name,
-                                 recv_msg->u.event.nid, recv_msg->u.event.pid,
-                                 recv_msg->u.event.verifier, recv_msg->u.event.event_id);
-                        mon_log_write(MON_CLUSTER_HANDLEOTHERNODE_5, SQ_LOG_ERR, buf);
-                    }
-                }
-                else
-                {
-                    char buf[MON_STRING_BUF_SIZE];
-                    snprintf(buf, sizeof(buf), "[%s], Can't find process nid"
-                             "=%d, pid=%d for processing event.\n",
-                             method_name,
-                             recv_msg->u.event.nid, recv_msg->u.event.pid);
-                    mon_log_write(MON_CLUSTER_HANDLEOTHERNODE_6, SQ_LOG_ERR,
-                                  buf);
-                }
-            }
-        }
+        // Queue the event request for processing by a worker thread.
+        ReqQueue.enqueueEventReq( &recv_msg->u.event );
         break;
 #endif
 
@@ -2755,12 +2661,15 @@
         }
         else
         {
-            char buf[MON_STRING_BUF_SIZE];
-            snprintf(buf, sizeof(buf), "[%s], Can't find process nid=%d, "
-                     "pid=%d for stdin data request.\n", method_name,
-                     recv_msg->u.stdin_req.nid,
-                     recv_msg->u.stdin_req.pid);
-            mon_log_write(MON_CLUSTER_HANDLEOTHERNODE_9, SQ_LOG_DEBUG, buf);
+            if (trace_settings 
+               & (TRACE_SYNC | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL | TRACE_EVLOG_MSG))
+            {
+                trace_printf( "%s@%d - Can't find process nid=%d, "
+                              "pid=%d for stdin data request.\n"
+                            , method_name, __LINE__
+                            , recv_msg->u.stdin_req.nid
+                            , recv_msg->u.stdin_req.pid);
+            }
         }
         break;
 #endif
@@ -2824,119 +2733,6 @@
         ReqQueue.enqueueUniqStrReq( &recv_msg->u.uniqstr );
         break;
 
-#ifndef NAMESERVER_PROCESS
-    case InternalType_Sync:
-        if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_TMSYNC))
-            trace_printf("%s@%d - Internal sync request for"
-                         " Node %s, pnid=%d, SyncType=%d\n",
-                         method_name, __LINE__, Node[pnid]->GetName(), pnid,
-                         recv_msg->u.sync.type);
-        switch (recv_msg->u.sync.type )
-        {
-        case SyncType_TmData:
-            if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-                trace_printf("%s@%d - TMSYNC(TmData) on Node %s (pnid=%d), (phase=%d)\n", method_name, __LINE__, Node[pnid]->GetName(), pnid, MyNode->GetPhase());
-            if ( ! MyNode->IsSpareNode() && MyNode->GetPhase() != Phase_Ready )
-            {
-                MyNode->CheckActivationPhase();
-            }
-            if ( ! MyNode->IsSpareNode() && MyNode->GetPhase() == Phase_Ready )
-            {
-                if ( MyNode->GetTmSyncState() == SyncState_Null )
-                {
-                    // Begin a Slave Sync Start
-                    if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-                        trace_printf("%s@%d - Slave Sync Start on Node %s (pnid=%d)\n", method_name, __LINE__, Node[pnid]->GetName(), pnid);
-                    tmSyncPNid_ = pnid;
-                    Node[pnid]->SetTmSyncState( recv_msg->u.sync.state );
-                    if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-                    {
-                        trace_printf("%s@%d - Node %s (pnid=%d) TmSyncState updated (%d)(%s)\n", method_name, __LINE__, Node[pnid]->GetName(), pnid, Node[pnid]->GetTmSyncState(), SyncStateString( Node[pnid]->GetTmSyncState() ));
-                    }
-                    Monitor->CoordinateTmDataBlock( &recv_msg->u.sync );
-                }
-                else
-                {
-                    if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-                        trace_printf("%s@%d - Sync State Collision! Node %s (pnid=%d) TmSyncState=(%d)(%s)\n", method_name, __LINE__, MyNode->GetName(), MyPNID, MyNode->GetTmSyncState(), SyncStateString( MyNode->GetTmSyncState()) );
-                    if ( MyNode->GetTmSyncState() == SyncState_Continue )
-                    {
-                        if ( pnid > tmSyncPNid_ )
-                            // highest node id will continue
-                        {
-                            // They take priority ... we abort
-                            if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-                                trace_printf("%s@%d - Aborting Slave Sync Start on node %s (pnid=%d)\n", method_name, __LINE__, Node[Monitor->tmSyncPNid_]->GetName(), Monitor->tmSyncPNid_);
-                            MyNode->SetTmSyncState( SyncState_Null );
-                            if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-                                trace_printf("%s@%d - Node %s (pnid=%d) TmSyncState updated (%d)(%s)\n", method_name, __LINE__, MyNode->GetName(), MyPNID, MyNode->GetTmSyncState(), SyncStateString( MyNode->GetTmSyncState() ) );
-                            Monitor->ReQueue_TmSync (false);
-                            // Continue with other node's Slave TmSync Start request
-                            if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-                                trace_printf("%s@%d - Slave Sync Start on node %s (pnid=%d)\n", method_name, __LINE__, Node[pnid]->GetName(), pnid);
-                            tmSyncPNid_ = pnid;
-                            Node[pnid]->SetTmSyncState( recv_msg->u.sync.state );
-                            if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-                            {
-                                trace_printf("%s@%d - Node %s (pnid=%d) TmSyncState updated (%d)(%s)\n", method_name, __LINE__, Node[pnid]->GetName(), pnid, Node[pnid]->GetTmSyncState(), SyncStateString( Node[pnid]->GetTmSyncState() ));
-                            }
-                            Monitor->CoordinateTmDataBlock (&recv_msg->u.sync);
-                        }
-                    }
-                    else if ( MyNode->GetTmSyncState() == SyncState_Start )
-                    {
-                        // Check if they continue with Master Sync Start
-                        if ( pnid > MyPNID )
-                            // highest node id will continue
-                        {
-                            // They take priority ... we abort
-                            if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-                                trace_printf("%s@%d - Aborted Master Sync Start\n", method_name, __LINE__);
-                            MyNode->SetTmSyncState( SyncState_Null );
-                            if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-                                trace_printf("%s@%d - Node %s (pnid=%d) TmSyncState updated (%d)(%s)\n", method_name, __LINE__, MyNode->GetName(), MyPNID, MyNode->GetTmSyncState(), SyncStateString( MyNode->GetTmSyncState() ) );
-                            // Continue with other node's Slave TmSync Start request
-                            if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-                                trace_printf("%s@%d - Slave Sync Start on node %s (pnid=%d)\n", method_name, __LINE__, Node[pnid]->GetName(), pnid);
-                            tmSyncPNid_ = pnid;
-                            Node[pnid]->SetTmSyncState( recv_msg->u.sync.state );
-                            if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-                            {
-                                trace_printf("%s@%d - Node %s (pnid=%d) TmSyncState updated (%d)(%s)\n", method_name, __LINE__, Node[pnid]->GetName(), pnid, Node[pnid]->GetTmSyncState(), SyncStateString( Node[pnid]->GetTmSyncState() ));
-                            }
-                            Monitor->CoordinateTmDataBlock (&recv_msg->u.sync);
-                        }
-                        else
-                        {
-                            // We continue and assume they abort
-                            if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-                                trace_printf("%s@%d - Continuing with Master Sync Start\n", method_name, __LINE__);
-                        }
-                    }
-                    else
-                    {
-                        if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-                            trace_printf("%s@%d - Invalid TmSync_State\n", method_name, __LINE__);
-                    }
-                }
-            }
-            break;
-
-        case SyncType_TmSyncState:
-            if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-                trace_printf("%s@%d - TMSYNC(TmSyncState) on Node %s (pnid=%d)\n", method_name, __LINE__, Node[pnid]->GetName(), pnid);
-            break;
-
-        default:
-            {
-            char buf[MON_STRING_BUF_SIZE];
-            snprintf(buf, sizeof(buf), "[%s], Unknown SyncType from pnid=%d.\n", method_name, pnid);
-            mon_log_write(MON_CLUSTER_HANDLEOTHERNODE_10, SQ_LOG_ERR, buf);
-            }
-        }
-        break;
-#endif
-
     default:
         {
             char buf[MON_STRING_BUF_SIZE];
@@ -2954,14 +2750,13 @@
     const char method_name[] = "CCluster::HandleMyNodeMsg";
     TRACE_ENTRY;
 
+    CNode *downNode;
+    CNode *spareNode;
 #ifndef NAMESERVER_PROCESS
     CProcess *process;
     CLNode  *lnode;
 #endif
 
-    if (trace_settings & TRACE_SYNC_DETAIL)
-        trace_printf("%s@%d - Marking object as replicated, msg type=%d\n",
-                     method_name, __LINE__, recv_msg->type);
     switch (recv_msg->type)
     {
 
@@ -3004,6 +2799,7 @@
                                            , recv_msg->u.nameserver_delete.node_name );
         break;
 
+#ifndef NAMESERVER_PROCESS
     case InternalType_NodeAdd:
         if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
             trace_printf( "%s@%d - Internal node add request for node_name=%s, "
@@ -3026,6 +2822,7 @@
                                   , recv_msg->u.node_add.processors
                                   , recv_msg->u.node_add.roles );
         break;
+#endif
 
     case InternalType_Clone:
 #ifndef NAMESERVER_PROCESS
@@ -3057,6 +2854,7 @@
         ReqQueue.enqueueShutdownReq( recv_msg->u.shutdown.level );
         break;
 
+#ifndef NAMESERVER_PROCESS
     case InternalType_NodeDelete:
         if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
             trace_printf( "%s@%d - Internal node delete request for pnid=%d\n"
@@ -3068,6 +2866,7 @@
                                      , recv_msg->u.node_delete.req_verifier
                                      , recv_msg->u.node_delete.pnid );
         break;
+#endif
 
     case InternalType_Down:
         if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
@@ -3085,17 +2884,6 @@
                                    , recv_msg->u.node_name.current_name
                                    , recv_msg->u.node_name.new_name );
         break;
-
-    case InternalType_SoftNodeDown:
-        if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
-            trace_printf("%s@%d - Internal soft down node request for pnid=%d\n", method_name, __LINE__, recv_msg->u.down.pnid);
-        break;
-
-    case InternalType_SoftNodeUp:
-        if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
-            trace_printf("%s@%d - Internal soft up node request for pnid=%d\n", method_name, __LINE__, recv_msg->u.up.pnid);
-        break;
-
     case InternalType_Up:
         if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
             trace_printf("%s@%d - Internal up node request for pnid=%d\n", method_name, __LINE__, recv_msg->u.up.pnid);
@@ -3107,41 +2895,7 @@
             trace_printf("%s@%d - Internal dump request for nid=%d, pid=%d\n",
                          method_name, __LINE__,
                          recv_msg->u.dump.nid, recv_msg->u.dump.pid);
-
-        lnode = Nodes->GetLNode( recv_msg->u.dump.nid );
-        if ( lnode )
-        {
-            process = lnode->GetProcessL(recv_msg->u.dump.pid);
-
-            if (process)
-            {
-                int verifier = recv_msg->u.dump.verifier;
-                if ( (verifier == -1) || (verifier == process->GetVerifier()) )
-                {
-                    process->DumpBegin(recv_msg->u.dump.dumper_nid,
-                                       recv_msg->u.dump.dumper_pid,
-                                       recv_msg->u.dump.dumper_verifier,
-                                       recv_msg->u.dump.core_file);
-                }
-                else
-                {
-                    char buf[MON_STRING_BUF_SIZE];
-                    snprintf(buf, sizeof(buf), "[%s], Can't find process nid=%d, "
-                             "pid=%d, verifier=%d for dump target.\n", method_name,
-                             recv_msg->u.dump.nid, recv_msg->u.dump.pid,
-                             recv_msg->u.dump.verifier);
-                    mon_log_write(MON_CLUSTER_HANDLEMYNODE_1, SQ_LOG_ERR, buf);
-                }
-            }
-            else
-            {
-                char buf[MON_STRING_BUF_SIZE];
-                snprintf(buf, sizeof(buf), "[%s], Can't find process nid=%d, "
-                         "pid=%d for dump target.\n", method_name,
-                         recv_msg->u.dump.nid, recv_msg->u.dump.pid);
-                mon_log_write(MON_CLUSTER_HANDLEMYNODE_2, SQ_LOG_ERR, buf);
-            }
-        }
+        ReqQueue.enqueueDumpReq( &recv_msg->u.dump );
         break;
 
     case InternalType_DumpComplete:
@@ -3149,38 +2903,7 @@
             trace_printf("%s@%d - Internal dump-complete request for nid=%d, pid=%d\n",
                          method_name, __LINE__,
                          recv_msg->u.dump.nid, recv_msg->u.dump.pid);
-        lnode = Nodes->GetLNode( recv_msg->u.dump.nid );
-        if ( lnode )
-        {
-            process = lnode->GetProcessL(recv_msg->u.dump.pid);
-
-            if (process)
-            {
-                int verifier = recv_msg->u.dump.verifier;
-                if ( (verifier == -1) || (verifier == process->GetVerifier()) )
-                {
-                    process->DumpEnd(recv_msg->u.dump.status, recv_msg->u.dump.core_file);
-                }
-                else
-                {
-                    char buf[MON_STRING_BUF_SIZE];
-                    snprintf(buf, sizeof(buf), "[%s], Can't find process nid=%d, "
-                             "pid=%d, verifier=%d for dump target.\n", method_name,
-                             recv_msg->u.dump.nid, recv_msg->u.dump.pid,
-                             recv_msg->u.dump.verifier);
-                    mon_log_write(MON_CLUSTER_HANDLEMYNODE_3, SQ_LOG_ERR, buf);
-                }
-            }
-            else
-            {
-                // Dump completion handled in CProcess::Exit()
-                char buf[MON_STRING_BUF_SIZE];
-                snprintf(buf, sizeof(buf), "[%s], Can't find process nid=%d, "
-                         "pid=%d for dump complete target.\n", method_name,
-                         recv_msg->u.dump.nid, recv_msg->u.dump.pid);
-                mon_log_write(MON_CLUSTER_HANDLEMYNODE_4, SQ_LOG_ERR, buf);
-            }
-        }
+        ReqQueue.enqueueDumpCompleteReq( &recv_msg->u.dump );
         break;
 #endif
 
@@ -3263,50 +2986,6 @@
             trace_printf("%s@%d - Internal unique string request, completed replicating (%d, %d)\n", method_name, __LINE__, recv_msg->u.uniqstr.nid, recv_msg->u.uniqstr.id);
         break;
 
-#ifndef NAMESERVER_PROCESS
-    case InternalType_Sync:
-        if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_TMSYNC))
-            trace_printf("%s@%d - Internal sync request for node %s, pnid=%d, SyncType=%d\n"
-                         , method_name, __LINE__, Node[pnid]->GetName(), pnid, recv_msg->u.sync.type);
-        switch (recv_msg->u.sync.type )
-        {
-        case SyncType_TmData:
-            if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-                trace_printf("%s@%d    - TMSYNC(TmData) on Node %s (pnid=%d)\n", method_name, __LINE__, Node[MyPNID]->GetName(), MyPNID);
-            tmSyncPNid_ = MyPNID;
-            if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-                trace_printf("%s@%d    - Sync communicated, tmSyncPNid_=%d\n", method_name, __LINE__, tmSyncPNid_);
-            if ( ! MyNode->IsSpareNode() && MyNode->GetPhase() != Phase_Ready )
-            {
-                MyNode->CheckActivationPhase();
-            }
-            if ( MyNode->GetTmSyncState() == SyncState_Start &&
-                 MyNode->GetPhase() == Phase_Ready &&
-                 MyNode->GetLNodesCount() > 1 )
-            {
-                // Begin a Slave Sync Start to other
-                // logical nodes in my physical node
-                if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-                    trace_printf("%s@%d - Slave Sync Start on local node %s, pnid=%d\n", method_name, __LINE__, Node[pnid]->GetName(), pnid);
-                Monitor->CoordinateTmDataBlock( &recv_msg->u.sync );
-            }
-            break;
-
-        case SyncType_TmSyncState:
-            if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-                trace_printf("%s@%d    - TMSYNC(TmSyncState) on Node %s (pnid=%d)\n", method_name, __LINE__, Node[MyPNID]->GetName(), MyPNID);
-            break;
-
-        default:
-            {
-                char buf[MON_STRING_BUF_SIZE];
-                snprintf(buf, sizeof(buf), "[%s], Unknown SyncType from node %s, pnid=%d during processing local SyncType.\n", method_name, Node[pnid]->GetName(), pnid);
-                mon_log_write(MON_CLUSTER_HANDLEMYNODE_5, SQ_LOG_ERR, buf);
-            }
-        }
-        break;
-#endif
-
     default:
         {
             char buf[MON_STRING_BUF_SIZE];
@@ -3326,13 +3005,16 @@
     const char method_name[] = "CCluster::responsive";
     TRACE_ENTRY;
 
+    static bool logEvent = false;
     int barrierDiff = barrierCount_ - barrierCountSaved_;
+    struct timespec currTime;
+    static struct timespec nextLogTime;
 
     // if no difference in barrier count, sync thread is not responsive
     if  ( !barrierDiff && isMonInitComplete() )
     {
         // this proc is called every SYNC_MAX_RESPONSIVE+1 secs
-        cumulativeDelaySec_ += CCluster::SYNC_MAX_RESPONSIVE + 1;
+        cumulativeSyncDelay_ += CCluster::SYNC_MAX_RESPONSIVE + 1;
 
         monSyncResponsive_ = false; // sync thread is no longer responsive
 
@@ -3341,12 +3023,15 @@
             // if sync thread is stuck in mpi call, one of the following checks will be true
             if ( inBarrier_ || inAllGather_ || inCommDup_ )
             {
-                mem_log_write(MON_CLUSTER_RESPONSIVE_1, cumulativeDelaySec_,
-                              ( ( (inBarrier_ << 1) | inAllGather_ ) << 1 ) | inCommDup_);
+                mem_log_write( MON_CLUSTER_RESPONSIVE_1
+                             , cumulativeSyncDelay_
+                             , inCommDup_   ? 4 :
+                                inAllGather_ ? 2 : 
+                                 /* inBarrier_ */ 1 );
             }
             else // non-mpi took quite long
             {
-                mem_log_write(MON_CLUSTER_RESPONSIVE_2, cumulativeDelaySec_);
+                mem_log_write(MON_CLUSTER_RESPONSIVE_2, cumulativeSyncDelay_);
             }
         }
         else
@@ -3354,25 +3039,72 @@
             // if sync thread is stuck in mpi call
             if ( inBarrier_ )
             {
-                mem_log_write(MON_CLUSTER_RESPONSIVE_1, cumulativeDelaySec_,
-                              inBarrier_);
+                mem_log_write(MON_CLUSTER_RESPONSIVE_1, cumulativeSyncDelay_, 
+                              /* inBarrier_ */ 1);
             }
             else // non-mpi took quite long
             {
-                mem_log_write(MON_CLUSTER_RESPONSIVE_2, cumulativeDelaySec_);
+                mem_log_write(MON_CLUSTER_RESPONSIVE_2, cumulativeSyncDelay_);
+            }
+            
+            if (!logEvent)
+            {
+                if (cumulativeSyncDelay_ > syncDelayLogEventThreshold_)
+                {
+                    logEvent = true;
+                    clock_gettime(CLOCK_REALTIME, &currTime);
+                    nextLogTime = currTime;
+                }
+            }
+            else
+            {
+                clock_gettime(CLOCK_REALTIME, &currTime);
+            }
+#if 0
+            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+            {
+                trace_printf( "%s@%d - logEvent=%d, cumulativeSyncDelay_=%d, "
+                              "currTime.tv_sec=%ld(secs), nextLogTime.tv_sec=%ld(secs)\n"
+                            , method_name, __LINE__
+                            , logEvent
+                            , cumulativeSyncDelay_
+                            , currTime.tv_sec
+                            , nextLogTime.tv_sec );
+            }
+#endif
+            if (logEvent && currTime.tv_sec >= nextLogTime.tv_sec)
+            {
+                int syncTimeoutCountDown = 
+                    (HealthCheck.getSyncTimeout() - cumulativeSyncDelay_);
+                
+                nextLogTime.tv_sec = currTime.tv_sec + syncDelayLogEventInterval_;
+
+                char buf[MON_STRING_BUF_SIZE]; // text of the critical "sync thread not responsive" event
+                snprintf( buf, sizeof(buf) // bounded write, as for the other log messages; sprintf could overrun buf
+                       , "[%s], Sync thread not responsive, Allgather() "
+                         "IO completion exceeded by %d seconds! "
+                         "('Sync Thread Timeout' will occur in approximately %d "
+                         "seconds and instance will go down "
+                         "if not resolved)\n"
+                       , method_name
+                       , cumulativeSyncDelay_
+                       , (syncTimeoutCountDown > 0) ? syncTimeoutCountDown : 0 ); // clamp so a negative countdown is reported as 0
+                mon_log_write(MON_CLUSTER_RESPONSIVE_3, SQ_LOG_CRIT, buf);
             }
         }
     }
     else if (barrierDiff < syncMinPerSec_)
     {
-        mem_log_write(MON_CLUSTER_RESPONSIVE_3, barrierDiff, syncMinPerSec_);
-        cumulativeDelaySec_ = 0;
+        //logEvent = false;
+        mem_log_write(MON_CLUSTER_RESPONSIVE_4, barrierDiff, syncMinPerSec_);
+        cumulativeSyncDelay_ = 0;
         monSyncResponsive_ = true; // slow but responsive
     }
     else
     {
-        cumulativeDelaySec_ = 0;
-        monSyncResponsive_ = true; // truely responsive
+        logEvent = false;
+        cumulativeSyncDelay_ = 0;
+        monSyncResponsive_ = true; // truly responsive
     }
 
     barrierCountSaved_ = barrierCount_;
@@ -3417,9 +3149,11 @@
     // Update node membership in the cluster
 
     if (trace_settings & (TRACE_INIT | TRACE_REQUEST))
+    {
         trace_printf( "%s@%d - Configured physical nodes count=%d\n"
                     , method_name, __LINE__
                     , GetConfigPNodesCount() );
+    }
 
     if (nodeAdded)
     {
@@ -3441,15 +3175,24 @@
 
     if ( rs )
     {
+        if (trace_settings & (TRACE_INIT | TRACE_REQUEST))
+        {
+            trace_printf( "%s@%d - Updating cluster configuration, physical nodes count=%d, rs=%d\n"
+                        , method_name, __LINE__
+                        , GetConfigPNodesCount(), rs );
+        }
+
         CClusterConfig *clusterConfig = Nodes->GetClusterConfig();
         configPNodesCount_ = clusterConfig->GetPNodesCount();
         Nodes->UpdateCluster();
     }
 
     if (trace_settings & (TRACE_INIT | TRACE_REQUEST))
-        trace_printf( "%s@%d - Configured physical nodes count=%d\n"
+    {
+        trace_printf( "%s@%d - Configured physical nodes count=%d, rs=%d\n"
                     , method_name, __LINE__
-                    , GetConfigPNodesCount() );
+                    , GetConfigPNodesCount(), rs );
+    }
 
     TRACE_EXIT;
     return( rs );
@@ -3464,6 +3207,12 @@
     const char method_name[] = "CCluster::InitializeConfigCluster";
     TRACE_ENTRY;
 
+    if (trace_settings & TRACE_INIT)
+    {
+        trace_printf( "%s@%d (MasterMonitor) Node_name=%s, MyPNID=%d\n"
+                    , method_name, __LINE__, Node_name, MyPNID );
+    }
+
     int worldSize = 0;
     MPI_Comm_size (MPI_COMM_WORLD, &worldSize);
 #ifdef NAMESERVER_PROCESS
@@ -3527,6 +3276,13 @@
         if (MyPNID == -1)
         {
             MyPNID = clusterConfig->GetPNid( Node_name );
+
+            if (trace_settings & TRACE_INIT)
+            {
+                trace_printf( "%s@%d (MasterMonitor) Node_name=%s, MyPNID=%d\n"
+                            , method_name, __LINE__, Node_name, MyPNID );
+            }
+
             if (MyPNID == -1)
             {
                 char buf[MON_STRING_BUF_SIZE];
@@ -3534,7 +3290,7 @@
                          method_name, __LINE__, Node_name );
                 mon_log_write(MON_CLUSTER_INITCONFIGCLUSTER_1, SQ_LOG_CRIT, buf);
 
-                MPI_Abort(MPI_COMM_SELF,99);
+                mon_failure_exit();
             }
         }
     }
@@ -3550,12 +3306,13 @@
 
     if (trace_settings & TRACE_INIT)
     {
-        trace_printf( "%s@%d (MasterMonitor) IAmIntegrating=%d,"
-                      " IsAgentMode=%d, IsMaster=%d,"
-                      " MasterMonitorName=%s, Node_name=%s\n"
+        trace_printf( "%s@%d (MasterMonitor) IAmIntegrating=%s,"
+                      " IsAgentMode=%s, IsMaster=%s, MasterMonitorName=%s,"
+                      " Node_name=%s, MyNode Name=%s, MyPNID=%d\n"
                     , method_name, __LINE__
-                    , IAmIntegrating
-                    , IsAgentMode, IsMaster, MasterMonitorName, Node_name );
+                    , IAmIntegrating?"TRUE":"FALSE"
+                    , IsAgentMode?"TRUE":"FALSE", IsMaster?"TRUE":"FALSE"
+                    , MasterMonitorName, Node_name, MyNode->GetName(), MyPNID );
     }
 
     if (IAmIntegrating || IsAgentMode)
@@ -3634,7 +3391,7 @@
                          method_name, __LINE__, ErrorMsg(rc));
                 mon_log_write(MON_CLUSTER_INITCONFIGCLUSTER_2, SQ_LOG_CRIT, buf);
 
-                MPI_Abort(MPI_COMM_SELF,99);
+                mon_failure_exit();
             }
 
             // Collect sync port info from other monitors
@@ -3648,7 +3405,7 @@
                          method_name, __LINE__, ErrorMsg(rc));
                 mon_log_write(MON_CLUSTER_INITCONFIGCLUSTER_2, SQ_LOG_CRIT, buf);
 
-                MPI_Abort(MPI_COMM_SELF,99);
+                mon_failure_exit();
             }
 
             // Exchange Node Names with collective
@@ -3663,7 +3420,7 @@
                          method_name, __LINE__, ErrorMsg(rc));
                 mon_log_write(MON_CLUSTER_INITCONFIGCLUSTER_3, SQ_LOG_CRIT, buf);
 
-                MPI_Abort(MPI_COMM_SELF,99);
+                mon_failure_exit();
             }
 
             // For each node name received get corresponding CNode object and
@@ -3816,6 +3573,33 @@
     TRACE_EXIT;
 }
 
+void CCluster::InitializeConfigCluster( int pnid )
+{
+    const char method_name[] = "CCluster::InitializeConfigCluster";
+    TRACE_ENTRY;
+    // Variant taking a pnid: calls Nodes->AddLNodes(), flags 'pnid' as up in upNodes_, then calls Nodes->UpdateCluster().
+    Nodes->AddLNodes();
+
+    // Set bit indicating node is up (mask word = pnid/MAX_NODE_BITMASK, bit = pnid%MAX_NODE_BITMASK)
+    upNodes_.upNodes[pnid/MAX_NODE_BITMASK] |= 
+        (1ull << (pnid%MAX_NODE_BITMASK)); // NOTE(review): pnid is used unchecked as an index -- confirm callers pass a valid pnid
+
+    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+    {
+        for ( int i=0; i < MAX_NODE_MASKS ; i++ ) // write every upNodes_ mask word to the trace
+        {
+            trace_printf( "%s@%d upNodes set[%d]: %llx\n"
+                        , method_name, __LINE__
+                        , i, upNodes_.upNodes[i]);
+        }
+    }
+
+    // Refresh the pnid and nid maps
+    Nodes->UpdateCluster();
+
+    TRACE_EXIT;
+}
+
 void CCluster::InitClusterComm(int worldSize, int myRank, int * rankToPnid)
 {
     const char method_name[] = "CCluster::InitClusterComm";
@@ -4052,7 +3836,9 @@
     mon_log_write(MON_CLUSTER_REINTEGRATE_1, SQ_LOG_ERR, buf);
 
     if ( abortIn )
-        MPI_Abort(MPI_COMM_SELF,99);
+    {
+        abort();
+    }
 
     TRACE_EXIT;
 }
@@ -4060,11 +3846,21 @@
 void CCluster::SendReIntegrateStatus( STATE nodeState, int initErr )
 {
     const char method_name[] = "CCluster::SendReIntegrateStatus";
+    TRACE_ENTRY;
+
     int rc;
     nodeStatus_t nodeStatus;
     nodeStatus.state = nodeState;
     nodeStatus.status = initErr;
 
+    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+    {
+        trace_printf( "%s@%d - Sending reintegrate status: state=%s, error=%d\n"
+                    , method_name, __LINE__
+                    , StateString(nodeStatus.state)
+                    , nodeStatus.status );
+    }
+
     switch( CommType )
     {
         case CommType_InfiniBand:
@@ -4075,7 +3871,7 @@
                                  , joinComm_ );
             if ( rc )
             {
-                HandleReintegrateError( rc, Reintegrate_Err8, -1, NULL, true );
+                HandleReintegrateError( rc, Reintegrate_Err8, -1, NULL );
             }
             break;
         case CommType_Sockets:
@@ -4085,7 +3881,7 @@
                                   , method_name );
             if ( rc )
             {
-                HandleReintegrateError( rc, Reintegrate_Err8, -1, NULL, true );
+                HandleReintegrateError( rc, Reintegrate_Err8, -1, NULL );
             }
             break;
         default:
@@ -4094,14 +3890,16 @@
     }
 
     if ( nodeState != State_Up )
-    {  // Initialization error, abort.
+    {  // Initialization error
 
         mem_log_write(CMonLog::MON_REINTEGRATE_9, MyPNID, initErr);
-        HandleReintegrateError( rc, initErr, -1, NULL, true );
+        HandleReintegrateError( rc, initErr, -1, NULL );
     }
+
+    TRACE_EXIT;
 }
 
-bool CCluster::PingSockPeer(CNode *node)
+bool CCluster::PingSockPeer( CNode *node, struct timespec &peerZnodeFailTime )
 {
     const char method_name[] = "CCluster::PingSockPeer";
     TRACE_ENTRY;
@@ -4132,56 +3930,100 @@
             sv_connect_wait_timeout = 16;
             sv_connect_retry_count = 4;
         }
-
-        char buf[MON_STRING_BUF_SIZE];
-        snprintf( buf, sizeof(buf)
-                , "[%s@%d] Ping connect timeout wait_timeout=1 second, retry_count=%d\n"
-                , method_name
-                ,  __LINE__
-                , (sv_connect_retry_count * sv_connect_wait_timeout) );
-
-        mon_log_write( MON_PINGSOCKPEER_3, SQ_LOG_INFO, buf );
     }
 
-    bool rs = true;
-    int  rc;
+    bool createErrorZNode = true;
     int  pingSock = -1;
+    struct timespec currentTime;
 
-    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+    if (MyNode->IsPendingNodeDown())
     {
-        trace_printf( "%s@%d - Pinging remote monitor %s, pnid=%d\n"
-                    , method_name, __LINE__
-                    , node->GetName(), node->GetPNid() );
+        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+        {
+            trace_printf( "%s@%d - MyNode %s (%d) is going down, "
+                          "socks_[%d]=%d, state=%s, pendingNodeDown=%d\n"
+                        , method_name, __LINE__
+                        , MyNode->GetName(), MyNode->GetPNid()
+                        , MyNode->GetPNid(), socks_[MyNode->GetPNid()]
+                        , StateString(MyNode->GetState())
+                        , MyNode->IsPendingNodeDown() );
+        }
+        return( false );
     }
 
+    char buf[MON_STRING_BUF_SIZE];
+    snprintf( buf, sizeof(buf)
+            , "[%s@%d] Pinging remote monitor %s, pnid=%d\n"
+            , method_name,  __LINE__
+            , node->GetName(), node->GetPNid() );
+
+    mon_log_write( MON_PINGSOCKPEER_1, SQ_LOG_INFO, buf );
+
     // Attempt to connect with remote monitor in one seconds increments
     // to recover as quickly as possible or give up trying
     for (int i = 0; i < (sv_connect_retry_count*sv_connect_wait_timeout); i++ )
     {
-        // Disable internal retries
+        // Disable connect internal retries
         pingSock = Monitor->Connect( node->GetCommPort(), false );
         if ( pingSock < 0 )
         {
-            if (node->GetState() != State_Up)
+            clock_gettime(CLOCK_REALTIME, &currentTime);
+            if (node->GetState() != State_Up || node->IsPendingNodeDown())
             {
                 if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
                 {
                     trace_printf( "%s@%d - Node %s (%d) is not up, "
-                                  "socks_[%d]=%d\n"
+                                  "socks_[%d]=%d, state=%s, pendingNodeDown=%d\n"
                                 , method_name, __LINE__
                                 , node->GetName(), node->GetPNid()
-                                , node->GetPNid(), socks_[node->GetPNid()] );
+                                , node->GetPNid(), socks_[node->GetPNid()]
+                                , StateString(node->GetState())
+                                , node->IsPendingNodeDown() );
                 }
                 break;
             }
-            char buf[MON_STRING_BUF_SIZE];
-            snprintf( buf, sizeof(buf)
-                    , "[%s@%d] Retrying connect to remote monitor %s, pnid=%d, retry=%d\n"
-                    , method_name
-                    ,  __LINE__
-                    , node->GetName(), node->GetPNid(), i );
-            mon_log_write( MON_PINGSOCKPEER_4, SQ_LOG_INFO, buf );
-            sleep( 1 );
+            else if (currentTime.tv_sec > peerZnodeFailTime.tv_sec)
+            {
+                char buf[MON_STRING_BUF_SIZE];
+                snprintf( buf, sizeof(buf)
+                        , "[%s@%d] Connect exceeded session timeout to remote "
+                          "monitor %s, pnid=%d, retry=%d, "
+                          "currentTime=%ld(secs), peerZnodeFailTime=%ld(secs)\n"
+                        , method_name,  __LINE__
+                        , node->GetName(), node->GetPNid(), i 
+                        , currentTime.tv_sec
+                        , peerZnodeFailTime.tv_sec );
+                mon_log_write( MON_PINGSOCKPEER_2, SQ_LOG_WARNING, buf );
+                if (ZClientEnabled && createErrorZNode)
+                {
+                    if (node->GetState() == State_Up)
+                    {
+                        ZClient->ErrorZNodeCreate( node->GetName() );
+                    }
+                    else
+                    {
+                        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+                        {
+                            trace_printf( "%s@%d Node %s is not up, state=%s\n"
+                                        , method_name, __LINE__
+                                        , node->GetName()
+                                        , StateString(node->GetState()) );
+                        }
+                    }
+                    createErrorZNode = false;
+                }
+                break;
+            }
+            else
+            {
+                char buf[MON_STRING_BUF_SIZE];
+                snprintf( buf, sizeof(buf)
+                        , "[%s@%d] Retrying connect to remote monitor %s, pnid=%d, retry=%d\n"
+                        , method_name,  __LINE__
+                        , node->GetName(), node->GetPNid(), (i+1) );
+                mon_log_write( MON_PINGSOCKPEER_3, SQ_LOG_INFO, buf );
+                sleep( 1 );
+            }
         }
         else
         {
@@ -4199,6 +4041,7 @@
         return(false);
     }
 
+    int rc = MPI_SUCCESS;
     nodeId_t nodeInfo;
 
     nodeInfo.pnid = MyPNID;
@@ -4213,7 +4056,7 @@
 
     if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
     {
-        trace_printf( "Sending my nodeInfo.pnid=%d\n"
+        trace_printf( "%s@%d - Sending my nodeInfo.pnid=%d\n"
                       "        nodeInfo.nodeName=%s\n"
                       "        nodeInfo.commPort=%s\n"
                       "        nodeInfo.syncPort=%s\n"
@@ -4222,6 +4065,7 @@
                       "        nodeInfo.creatorShellPid=%d\n"
                       "        nodeInfo.creatorShellVerifier=%d\n"
                       "        nodeInfo.ping=%d\n"
+                    , method_name, __LINE__
                     , nodeInfo.pnid
                     , nodeInfo.nodeName
                     , nodeInfo.commPort
@@ -4233,45 +4077,71 @@
                     , nodeInfo.ping );
     }
 
-    rc = Monitor->SendSock( (char *) &nodeInfo
-                          , sizeof(nodeId_t)
-                          , pingSock
-                          , method_name );
-
+    rc = SendSock( (char *) &nodeInfo
+                 , sizeof(nodeId_t)
+                 , pingSock
+                 , method_name );
     if ( rc )
     {
-        rs = false;
+        shutdown( pingSock, SHUT_RDWR);
+        close( (int)pingSock );
+
         char buf[MON_STRING_BUF_SIZE];
         snprintf( buf, sizeof(buf)
-                , "[%s], Cannot send ping node info to node %s: (%s)\n"
-                , method_name, node->GetName(), ErrorMsg(rc));
-        mon_log_write(MON_PINGSOCKPEER_1, SQ_LOG_ERR, buf);
+                , "[%s], Cannot send my node info to node %s: (%s)\n"
+                , method_name
+                , node?node->GetName():"", ErrorMsg(rc));
+        mon_log_write(MON_PINGSOCKPEER_4, SQ_LOG_ERR, buf);
+        return(false);
     }
     else
     {
         // Get info about connecting monitor
-        rc = Monitor->ReceiveSock( (char *) &nodeInfo
-                                 , sizeof(nodeId_t)
-                                 , pingSock
-                                 , method_name );
+        rc = ReceiveSock( (char *) &nodeInfo
+                        , sizeof(nodeId_t)
+                        , pingSock
+                        , method_name );
         if ( rc )
         {   // Handle error
-            rs = false;
+            shutdown( pingSock, SHUT_RDWR);
+            close( (int)pingSock );
+
             char buf[MON_STRING_BUF_SIZE];
             snprintf( buf, sizeof(buf)
-                    , "[%s], Cannot receive ping node info from node %s: (%s)\n"
-                    , method_name, node->GetName(), ErrorMsg(rc));
-            mon_log_write(MON_PINGSOCKPEER_2, SQ_LOG_ERR, buf);
+                    , "[%s], unable to obtain node sync info from remote"
+                      "monitor: %s.\n"
+                    , method_name, ErrorMsg(rc));
+            mon_log_write(MON_PINGSOCKPEER_5, SQ_LOG_ERR, buf);    
+            return(false);
         }
         else
         {
+            if (ZClientEnabled)
+            {
+                int zerr;
+                if ( ZClient->IsRunningZNodeExpired( node->GetName(), zerr ) )
+                {   // Handle znode expiration
+                    shutdown( pingSock, SHUT_RDWR);
+                    close( (int)pingSock );
+        
+                    char buf[MON_STRING_BUF_SIZE];
+                    snprintf( buf, sizeof(buf)
+                            , "[%s], Ping successful, but znode expired on "
+                              "node: %s.\n"
+                            , method_name, zerror(zerr));
+                    mon_log_write(MON_PINGSOCKPEER_6, SQ_LOG_ERR, buf);    
+                    return(false);
+                }
+            }
+        
             if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
             {
-                trace_printf( "Received from nodeInfo.pnid=%d\n"
+                trace_printf( "%s@%d - Received from nodeInfo.pnid=%d\n"
                               "        nodeInfo.nodeName=%s\n"
                               "        nodeInfo.commPort=%s\n"
                               "        nodeInfo.syncPort=%s\n"
                               "        nodeInfo.ping=%d\n"
+                            , method_name, __LINE__
                             , nodeInfo.pnid
                             , nodeInfo.nodeName
                             , nodeInfo.commPort
@@ -4281,10 +4151,24 @@
         }
     }
 
+    shutdown( pingSock, SHUT_RDWR);
     close( pingSock );
 
+    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+    {
+        trace_printf( "%s@%d - Ping success to remote monitor %s, pnid=%d\n"
+                    , method_name, __LINE__
+                    , node->GetName(), node->GetPNid() );
+    }
+
+    if (ZClientEnabled)
+    {
+        // Clean up error znodes where I am their 'only' child
+        ZClient->HandleErrorChildZNodesForZNodeChild( Node_name, false );
+    }
+
     TRACE_EXIT;
-    return( rs );
+    return( true );
 }
 
 void CCluster::ReIntegrate( int initProblem )
@@ -4349,7 +4233,7 @@
                            MPI_INFO_NULL, 0, MPI_COMM_SELF, &joinComm_ );
     if ( rc )
     {
-        HandleReintegrateError( rc, Reintegrate_Err1, -1, NULL, true );
+        HandleReintegrateError( rc, Reintegrate_Err1, -1, NULL );
     }
 
     MPI_Comm_set_errhandler( joinComm_, MPI_ERRORS_RETURN );
@@ -4371,15 +4255,14 @@
     myNodeInfo.creatorShellVerifier = CreatorShellVerifier;
     if ((rc = Monitor->SendMPI((char *) &myNodeInfo, sizeof(nodeId_t), 0,
                             MON_XCHNG_DATA, joinComm_)))
-        HandleReintegrateError( rc, Reintegrate_Err9, -1, NULL,
-                                true );
+        HandleReintegrateError( rc, Reintegrate_Err9, -1, NULL );
 
     TEST_POINT( TP012_NODE_UP );
 
     // Merge the inter-communicators obtained from the connect/accept
     // between this new monitor and the creator monitor.
     if ((rc = MPI_Intercomm_merge( joinComm_, 1, &intraCommCreatorMon )))
-        HandleReintegrateError( rc, Reintegrate_Err2, -1, NULL, true );
+        HandleReintegrateError( rc, Reintegrate_Err2, -1, NULL );
 
     MPI_Comm_set_errhandler( intraCommCreatorMon, MPI_ERRORS_RETURN );
 
@@ -4391,12 +4274,12 @@
     // the creator monitor.
     if ((rc = Monitor->ReceiveMPI((char *)nodeInfo, sizeof(nodeId_t)*GetConfigPNodesCount(),
                                MPI_ANY_SOURCE, MON_XCHNG_DATA, joinComm_)))
-        HandleReintegrateError( rc, Reintegrate_Err3, -1, NULL, true );
+        HandleReintegrateError( rc, Reintegrate_Err3, -1, NULL );
 
     if ( initProblem )
     {
         // The monitor encountered an initialization error.  Inform
-        // the creator monitor that the node is down.  Then abort.
+        // the creator monitor that the node is down.  Then exit.
         SendReIntegrateStatus( State_Down, initProblem );
     }
 
@@ -4445,8 +4328,7 @@
                                         MPI_INFO_NULL, 0, MPI_COMM_SELF,
                                         &interComm )))
             {
-                HandleReintegrateError( rc, Reintegrate_Err5, i, &nodeInfo[i],
-                                        false );
+                HandleReintegrateError( rc, Reintegrate_Err5, i, &nodeInfo[i] );
                 SendReIntegrateStatus( State_Down, Reintegrate_Err14 );
             }
 
@@ -4465,14 +4347,13 @@
             if ((rc = Monitor->SendMPI((char *) &myNodeInfo, sizeof(nodeId_t), 0,
                                     MON_XCHNG_DATA, interComm)))
             {
-                HandleReintegrateError( rc, Reintegrate_Err4, i, &nodeInfo[i],
-                                        false );
+                HandleReintegrateError( rc, Reintegrate_Err4, i, &nodeInfo[i] );
                 SendReIntegrateStatus( State_Down, Reintegrate_Err14 );
             }
 
             if ((rc = MPI_Intercomm_merge(interComm, 1, &intraComm)))
             {
-                HandleReintegrateError( rc, Reintegrate_Err6, i, NULL, false );
+                HandleReintegrateError( rc, Reintegrate_Err6, i, NULL );
                 SendReIntegrateStatus( State_Down, Reintegrate_Err14 );
             }
 
@@ -4486,8 +4367,7 @@
                                        MPI_ANY_SOURCE, MON_XCHNG_DATA,
                                        interComm)))
             {
-                HandleReintegrateError( rc, Reintegrate_Err15, i, NULL,
-                                        false );
+                HandleReintegrateError( rc, Reintegrate_Err15, i, NULL );
                 SendReIntegrateStatus( State_Down, Reintegrate_Err14 );
             }
 
@@ -4500,7 +4380,7 @@
             }
 
             if ((rc = MPI_Comm_disconnect(&interComm)))
-                HandleReintegrateError( rc, Reintegrate_Err7, i, NULL, false );
+                HandleReintegrateError( rc, Reintegrate_Err7, i, NULL );
 
             MPI_Comm_set_errhandler(intraComm, MPI_ERRORS_RETURN);
 
@@ -4610,7 +4490,7 @@
             }
             else
             {
-                HandleReintegrateError( joinSock_, Reintegrate_Err1, -1, NULL, true );
+                HandleReintegrateError( joinSock_, Reintegrate_Err1, -1, NULL );
             }
         }
         else
@@ -4666,7 +4546,7 @@
                           , method_name );
     if ( rc )
     {
-        HandleReintegrateError( rc, Reintegrate_Err9, -1, NULL, true );
+        HandleReintegrateError( rc, Reintegrate_Err9, -1, NULL );
     }
 
     TEST_POINT( TP012_NODE_UP );
@@ -4691,13 +4571,13 @@
                              , method_name );
     if ( rc )
     {
-        HandleReintegrateError( rc, Reintegrate_Err3, -1, NULL, true );
+        HandleReintegrateError( rc, Reintegrate_Err3, -1, NULL );
     }
 
     if ( initProblem )
     {
         // The monitor encountered an initialization error.  Inform
-        // the creator monitor that the node is down.  Then abort.
+        // the creator monitor that the node is down.  Then exit.
         SendReIntegrateStatus( State_Down, initProblem );
     }
 
@@ -4740,8 +4620,7 @@
                                      , method_name );
             if ( rc || creatorpnid != nodeInfo[i].creatorPNid )
             {
-                HandleReintegrateError( rc, Reintegrate_Err15, i, NULL,
-                                        false );
+                HandleReintegrateError( rc, Reintegrate_Err15, i, NULL );
                 SendReIntegrateStatus( State_Down, Reintegrate_Err14 );
             }
 
@@ -4790,8 +4669,7 @@
                                   , method_name );
             if ( rc )
             {
-                HandleReintegrateError( rc, Reintegrate_Err4, i, &nodeInfo[i],
-                                        false );
+                HandleReintegrateError( rc, Reintegrate_Err4, i, &nodeInfo[i] );
                 SendReIntegrateStatus( State_Down, Reintegrate_Err14 );
             }
 
@@ -4799,8 +4677,7 @@
             existingSyncFd = AcceptSyncSock();
             if ( existingSyncFd < 0 )
             {
-                HandleReintegrateError( rc, Reintegrate_Err5, i, &nodeInfo[i],
-                                        false );
+                HandleReintegrateError( rc, Reintegrate_Err5, i, &nodeInfo[i] );
                 SendReIntegrateStatus( State_Down, Reintegrate_Err14 );
             }
             socks_[nodeInfo[i].pnid] = existingSyncFd; // ReIntegrateSock
@@ -4851,8 +4728,7 @@
             existingCommFd = Monitor->Connect( nodeInfo[i].commPort );
             if ( existingCommFd < 0 )
             {
-                HandleReintegrateError( rc, Reintegrate_Err5, i, &nodeInfo[i],
-                                        false );
+                HandleReintegrateError( rc, Reintegrate_Err5, i, &nodeInfo[i] );
                 SendReIntegrateStatus( State_Down, Reintegrate_Err14 );
             }
 
@@ -4872,8 +4748,7 @@
                                   , method_name );
             if ( rc )
             {
-                HandleReintegrateError( rc, Reintegrate_Err4, i, &nodeInfo[i],
-                                        false );
+                HandleReintegrateError( rc, Reintegrate_Err4, i, &nodeInfo[i] );
                 SendReIntegrateStatus( State_Down, Reintegrate_Err14 );
             }
 
@@ -4889,8 +4764,7 @@
                                      , method_name );
             if ( rc || remotepnid != nodeInfo[i].pnid )
             {
-                HandleReintegrateError( rc, Reintegrate_Err15, i, NULL,
-                                        false );
+                HandleReintegrateError( rc, Reintegrate_Err15, i, NULL );
                 SendReIntegrateStatus( State_Down, Reintegrate_Err14 );
             }
 
@@ -4936,8 +4810,7 @@
             existingSyncFd = AcceptSyncSock();
             if ( existingSyncFd < 0 )
             {
-                HandleReintegrateError( rc, Reintegrate_Err5, i, &nodeInfo[i],
-                                        false );
+                HandleReintegrateError( rc, Reintegrate_Err5, i, &nodeInfo[i] );
                 SendReIntegrateStatus( State_Down, Reintegrate_Err14 );
             }
             socks_[nodeInfo[i].pnid] = existingSyncFd; // ReIntegrateSock
@@ -5357,7 +5230,7 @@
             break;
         default:
             // Programmer bonehead!
-            MPI_Abort(MPI_COMM_SELF,99);
+            abort();
     }
 
     TRACE_EXIT;
@@ -5502,12 +5375,14 @@
     bool reconnecting = false;
     static int hdrSize = Nodes->GetSyncHdrSize( );
     int err = MPI_SUCCESS;
+    int lastReconnectErr = MPI_ERR_IN_STATUS;
     peer_t p[GetConfigPNodesMax()];
     memset( p, 0, sizeof(p) );
     tag = tag; // make compiler happy
+    struct timespec currentTime;
     // Set to twice the ZClient session timeout
     static int sessionTimeout = ZClientEnabled
-                                ? (ZClient->GetSessionTimeout() * 2) : 120;
+                                ? (ZClient->SessionTimeoutGet() * 2) : 120;
 
     int nsent = 0, nrecv = 0;
     for ( int iPeer = 0; iPeer < GetConfigPNodesCount(); iPeer++ )
@@ -5573,6 +5448,12 @@
             {
                 sv_epoll_retry_count = atoi( lv_epoll_retry_count_env );
             }
+            else
+            {
+                // default to 64 seconds
+                sv_epoll_wait_timeout = 16000;
+                sv_epoll_retry_count = 4;
+            }
             if ( sv_epoll_retry_count > 180 )
             {
                 sv_epoll_retry_count = 180;
@@ -5596,6 +5477,9 @@
         mon_log_write( MON_CLUSTER_ALLGATHERSOCK_1, SQ_LOG_INFO, buf );
     }
 
+    bool resetConnections = false;
+    int peerTimedoutCount = 0;
+
     // do the work
     struct epoll_event events[2*GetConfigPNodesMax() + 1];
     while ( 1 )
@@ -5603,8 +5487,7 @@
 reconnected:
         bool checkConnections = false;
         bool doReconnect = false;
-        bool resetConnections = false;
-        int peerTimedoutCount = 0;
+        int numPeersTimedout = 0;
         int maxEvents = 2*GetConfigPNodesCount() - nsent - nrecv;
         if ( maxEvents == 0 ) break;
         int nw;
@@ -5618,27 +5501,19 @@
 
         if ( nw == 0 )
         { // Timeout, no fd's ready
+            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+            {
+                trace_printf( "%s@%d" " - IO timeout! (seqNum_=%lld)\n"
+                            , method_name, __LINE__, seqNum_ );
+            }
+        
+            peerTimedoutCount++;
+            clock_gettime(CLOCK_REALTIME, &currentTime);
             for ( int iPeer = 0; iPeer < GetConfigPNodesCount(); iPeer++ )
             { // Check no IO completion on peers
                 peer = &p[indexToPnid_[iPeer]];
                 if ( (peer->p_receiving) || (peer->p_sending) )
                 {
-                    peerTimedoutCount++;
-                    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
-                    {
-                        trace_printf( "%s@%d - EPOLL timeout (%d) on: %s(%d), "
-                                      "socks_[%d]=%d, "
-                                      "peer->p_sending=%d, "
-                                      "peer->p_receiving=%d\n"
-                                    , method_name, __LINE__
-                                    , peerTimedoutCount
-                                    , Node[indexToPnid_[iPeer]]->GetName(), indexToPnid_[iPeer]
-                                    , indexToPnid_[iPeer]
-                                    , socks_[indexToPnid_[iPeer]]
-                                    , peer->p_sending
-                                    , peer->p_receiving );
-                    }
-
                     if (peer->p_initial_check && !reconnecting)
                     { // Set the session timeout relative to now
                         peer->p_initial_check = false;
@@ -5652,27 +5527,49 @@
                         }
                     }
 
-                    if ( IsRealCluster && peer->p_timeout_count < sv_epoll_retry_count )
+                    numPeersTimedout++;
+                    peer->p_timeout_count++;
+                    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
                     {
-                        peer->p_timeout_count++;
+                        trace_printf( "%s@%d - EPOLL timeout (peer->p_timeout_count=%d) on: %s(%d), "
+                                      "socks_[%d]=%d, "
+                                      "peer->p_sending=%d, "
+                                      "peer->p_receiving=%d\n"
+                                    , method_name, __LINE__
+                                    , peer->p_timeout_count
+                                    , Node[indexToPnid_[iPeer]]->GetName(), indexToPnid_[iPeer]
+                                    , indexToPnid_[iPeer]
+                                    , socks_[indexToPnid_[iPeer]]
+                                    , peer->p_sending
+                                    , peer->p_receiving );
+                    }
+
+                    if (IsRealCluster && peer->p_timeout_count)
+                    {
                         checkConnections = true;
-                        if (peer->p_timeout_count == sv_epoll_retry_count)
+                        if (lastReconnectErr == MPI_SUCCESS
+                         && peer->p_timeout_count)
                         {
                             resetConnections = true;
                         }
                     }
                     else
                     {
+                        checkConnections = (IsRealCluster) ? true : false;
                         if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
                         {
                             trace_printf( "%s@%d" " - Peer timed out: %s(%d), "
                                           "socks_[%d]=%d, "
-                                          "peer->p_timeout_count=%d\n"
+                                          "peer->p_timeout_count=%d, "
+                                          "peer->znodeFailedTime=%ld(secs), "
+                                          "currentTime=%ld(secs)\n"
                                         , method_name, __LINE__
                                         , Node[indexToPnid_[iPeer]]->GetName(), indexToPnid_[iPeer]
                                         , indexToPnid_[iPeer]
                                         , socks_[indexToPnid_[iPeer]]
-                                        , peer->p_timeout_count );
+                                        , peer->p_timeout_count
+                                        , peer->znodeFailedTime.tv_sec
+                                        , currentTime.tv_sec );
                         }
                     }
                 }
@@ -5683,10 +5580,14 @@
                 checkConnections = false;
                 if (trace_settings & TRACE_RECOVERY)
                 {
-                    trace_printf( "%s@%d - Initializing AllgatherSockReconnect(),"
-                                  " peerTimedoutCount=%d\n"
+                    trace_printf( "%s@%d - Initiating AllgatherSockReconnect(), "
+                                  "peerTimedoutCount=%d, numPeersTimedout=%d, "
+                                  "resetConnections=%d, lastReconnectErr=%d\n"
                                 , method_name, __LINE__
-                                , peerTimedoutCount );
+                                , peerTimedoutCount
+                                , numPeersTimedout
+                                , resetConnections
+                                , lastReconnectErr );
                 }
                 // First, check ability to connect to all peers
                 // An err returned will mean that connect failed with
@@ -5694,13 +5595,13 @@
                 // reset occurred and there is probably one dead connection
                 // to a peer where no IOs will complete ever, so connections
                 // to all peers must be reestablished.
-                err = AllgatherSockReconnect( stats, false );
+                lastReconnectErr = err = AllgatherSockReconnect( stats, p, resetConnections );
                 if (err == MPI_SUCCESS)
                 { // Connections to all peers are good
                     if (resetConnections)
                     { // Establish new connections on all peers
                         resetConnections = false;
-                        err = AllgatherSockReconnect( stats, true );
+                        peerTimedoutCount = 0;
                         // Redrive IOs on new peer connections
                         nsent = 0; nrecv = 0;
                         for ( int i = 0; i < GetConfigPNodesCount(); i++ )
@@ -5716,6 +5617,7 @@
                             {
                                 peer->p_sending = peer->p_receiving = true;
                                 peer->p_sent = peer->p_received = 0;
+                                peer->p_timeout_count = 0;
                                 peer->p_n2recv = -1;
                                 peer->p_buff = ((char *) rbuf) + (indexToPnid_[i] * CommBufSize);
                                 struct epoll_event event;
@@ -5786,8 +5688,11 @@
                 reconnecting = true;
                 if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
                 {
-                    trace_printf( "%s@%d" " - Reconnecting! (reconnectSeqNum_=%lld)\n"
-                                , method_name, __LINE__, reconnectSeqNum_ );
+                    trace_printf( "%s@%d" " - Reconnecting! "
+                                  "(lastReconnectErr=%d, reconnectSeqNum_=%lld)\n"
+                                , method_name, __LINE__
+                                , lastReconnectErr
+                                , reconnectSeqNum_ );
                 }
                 goto reconnected;
             }
@@ -5801,7 +5706,8 @@
                 method_name, __LINE__, epollFD_, maxEvents,
                 strerror_r( errno, ebuff, 256 ) );
             mon_log_write( MON_CLUSTER_ALLGATHERSOCK_3, SQ_LOG_CRIT, buf );
-            MPI_Abort( MPI_COMM_SELF,99 );
+
+            mon_failure_exit();
         }
 
         // Process fd's which are ready to initiate an IO or completed IO
@@ -5828,7 +5734,8 @@
                         , indexToPnid_[iPeer] >= GetConfigPNodesMax()?-1:p[indexToPnid_[iPeer]].p_sending
                         , indexToPnid_[iPeer] >= GetConfigPNodesMax()?-1:p[indexToPnid_[iPeer]].p_receiving );
                 mon_log_write( MON_CLUSTER_ALLGATHERSOCK_4, SQ_LOG_CRIT, buf );
-                MPI_Abort( MPI_COMM_SELF,99 );
+
+                mon_failure_exit();
             }
             peer_t *peer = &p[indexToPnid_[iPeer]];
             if ( (events[iEvent].events & EPOLLERR) ||
@@ -5839,28 +5746,52 @@
                 // ready for reading nor writing
                 char buf[MON_STRING_BUF_SIZE];
                 snprintf( buf, sizeof(buf)
-                        , "[%s@%d] Error: peer=%d, events[%d].data.fd=%d, event[%d]=%s\n"
+                        , "[%s@%d] Error: peer=%s(%d), events[%d].data.fd=%d, event[%d]=%s\n"
                         , method_name, __LINE__
+                        , Node[indexToPnid_[iPeer]]->GetName()
                         , indexToPnid_[iPeer]
                         , iEvent
                         , events[iEvent].data.fd
                         , iEvent
                         , EpollEventString(events[iEvent].events) );
                 mon_log_write( MON_CLUSTER_ALLGATHERSOCK_5, SQ_LOG_CRIT, buf );
-                stats[indexToPnid_[iPeer]].MPI_ERROR = MPI_ERR_EXITED;
-                err = MPI_ERR_IN_STATUS;
-                if ( peer->p_sending )
+
+                err = CheckSockPeer( indexToPnid_[iPeer], stats, peer );
+                if (err == MPI_SUCCESS)
                 {
-                    peer->p_sending = false;
-                    nsent++;
+                    if ( indexToPnid_[iPeer] == MyPNID || socks_[indexToPnid_[iPeer]] == -1 )
+                    { // peer is me or not available
+                        peer->p_sending = peer->p_receiving = false;
+                        nsent++;
+                        nrecv++;
+                    }
+                    else
+                    {
+                        doReconnect = true;
+                    }
                 }
-                if ( peer->p_receiving )
+                else
                 {
-                    peer->p_receiving = false;
-                    nrecv++;
+                    if (stats[indexToPnid_[iPeer]].MPI_ERROR == MPI_SUCCESS)
+                    {
+                        doReconnect = true;
+                        checkConnections = (IsRealCluster) ? true : false;
+                    }
+                    else
+                    {
+                        if (peer->p_sending)
+                        {
+                            nsent++;
+                            peer->p_sending = false;
+                        }
+                        if (peer->p_receiving)
+                        {
+                            peer->p_receiving = false;
+                            nrecv++;
+                        }
+                        goto early_exit;
+                    }
                 }
-                stateChange = true;
-                goto early_exit;
             }
             if ( peer->p_receiving && events[iEvent].events & EPOLLIN )
             { // Got receive (read) completion
@@ -5879,7 +5810,7 @@
                 int nr;
                 while ( 1 )
                 {
-                    if (trace_settings & TRACE_SYNC_DETAIL)
+                    if (trace_settings & TRACE_SYNC)
                     {
                         trace_printf( "%s@%d - EPOLLIN from %s(%d),"
                                       " sending=%d,"
@@ -5964,7 +5895,8 @@
                                 "[%s@%d] error n2recv %d\n",
                                 method_name, __LINE__, peer->p_n2recv );
                             mon_log_write( MON_CLUSTER_ALLGATHERSOCK_7, SQ_LOG_CRIT, buf );
-                            MPI_Abort( MPI_COMM_SELF,99 );
+
+                            mon_failure_exit();
                         }
                         if ( peer->p_n2recv == 0 )
                         {
@@ -5972,9 +5904,9 @@
                             peer->p_receiving = false;
                             nrecv++;
                             stats[indexToPnid_[iPeer]].count = peer->p_received;
-                            if (trace_settings & TRACE_SYNC_DETAIL)
+                            if (trace_settings & TRACE_SYNC)
                             {
-                                trace_printf( "%s@%d - EPOLLOUT to %s(%d),"
+                                trace_printf( "%s@%d - EPOLLIN from %s(%d),"
                                               " sending=%d,"
                                               " receiving=%d (%d)"
                                               " sent=%d,"
@@ -6004,7 +5936,7 @@
                 int ns;
                 while ( 1 )
                 {
-                    if (trace_settings & TRACE_SYNC_DETAIL)
+                    if (trace_settings & TRACE_SYNC)
                     {
                         trace_printf( "%s@%d - EPOLLOUT to %s(%d),"
                                       " sending=%d (%d),"
@@ -6057,7 +5989,7 @@
                         // finished sending to this destination
                         peer->p_sending = false;
                         nsent++;
-                        if (trace_settings & TRACE_SYNC_DETAIL)
+                        if (trace_settings & TRACE_SYNC)
                         {
                             trace_printf( "%s@%d - EPOLLOUT to %s(%d),"
                                           " sending=%d (%d),"
@@ -6082,7 +6014,7 @@
                 }
             }
 early_exit:
-            if ( stateChange )
+            if ( stateChange && (socks_[indexToPnid_[iPeer]] != -1))
             {
                 struct epoll_event event;
                 event.data.fd = socks_[indexToPnid_[iPeer]];
@@ -6105,10 +6037,42 @@
                 if ( op == EPOLL_CTL_DEL || op == EPOLL_CTL_MOD )
                 {
                     EpollCtl( epollFD_, op, fd, &event );
+                    if (op == EPOLL_CTL_DEL
+                     && stats[indexToPnid_[iPeer]].MPI_ERROR == MPI_ERR_EXITED)
+                    {
+                        CNode *node = Node[indexToPnid_[iPeer]];
+                        if (trace_settings & (TRACE_SYNC |TRACE_INIT | TRACE_RECOVERY))
+                        {
+                            trace_printf( "%s@%d - Node %s (%d) is not available, "
+                                          "removing old socket from epoll set, "
+                                          "socks_[%d]=%d\n"
+                                        , method_name, __LINE__
+                                        , node->GetName(), node->GetPNid()
+                                        , indexToPnid_[iPeer]
+                                        , socks_[indexToPnid_[iPeer]] );
+
+                        }
+                        shutdown( socks_[indexToPnid_[iPeer]], SHUT_RDWR);
+                        close( socks_[indexToPnid_[iPeer]] );
+                        socks_[indexToPnid_[iPeer]] = -1;
+                    }
                 }
             }
-        }
-    }
+            if (doReconnect)
+            {
+                reconnectSeqNum_ = seqNum_;
+                reconnecting = true;
+                if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+                {
+                    trace_printf( "%s@%d" " - Reconnecting! "
+                                  "(lastReconnectErr=%d, reconnectSeqNum_=%lld)\n"
+                                , method_name, __LINE__
+                                , lastReconnectErr
+                                , reconnectSeqNum_ );
+                }
+            }
+        } // for (event)
+    } // while ( 1 )
 
     MonStats->BarrierWaitDecr( );
     inBarrier_ = false;
@@ -6119,7 +6083,7 @@
     return err;
 }
 
-int CCluster::AllgatherSockReconnect( MPI_Status *stats, bool reestablishConnections )
+int CCluster::AllgatherSockReconnect( MPI_Status *stats, peer_t *peers, bool resetConnections )
 {
     const char method_name[] = "CCluster::AllgatherSockReconnect";
     TRACE_ENTRY;
@@ -6127,23 +6091,45 @@
     int err = MPI_SUCCESS;
     int idst;
     int reconnectSock = -1;
+    int zerr = ZOK;
     CNode *node;
+    peer_t *peer;
+
+    if( !IsRealCluster )
+    { // In virtual cluster, just return success
+        TRACE_EXIT;
+        return( err );
+    }
+    
+    // Release the sync lock temporarily to allow request worker thread to
+    // process any request that needs the sync lock.
+    Monitor->ExitSyncCycle();
+    pthread_yield();
+
+    if (resetConnections)
+    {
+        char buf[MON_STRING_BUF_SIZE];
+        snprintf( buf, sizeof(buf)
+                , "[%s@%d] Resetting sync port connections! (resetConnections=%d)\n"
+                , method_name, __LINE__, resetConnections );
+        mon_log_write( MON_CLUSTER_ALLGATHERSOCKRECONN_2, SQ_LOG_INFO, buf );
+    }
 
     // Loop on each node in the cluster
-    for ( int i = 0; i < GetConfigPNodesMax(); i++ )
+    for ( int i = 0; i < GetConfigPNodesCount(); i++ )
     {
         // Loop on each adjacent node in the cluster
-        for ( int j = i+1; j < GetConfigPNodesMax(); j++ )
+        for ( int j = i+1; j < GetConfigPNodesCount(); j++ )
         {
-            if ( i == MyPNID )
-            { // Current [i] node is my node, so connect to [j] node
+            if ( indexToPnid_[i] == MyPNID )
+            { // Current indexToPnid_[i] node is my node, so connect to indexToPnid_[j] node
 
                 idst = j;
-                node = Nodes->GetNode( idst );
+                node = Nodes->GetNode( indexToPnid_[idst] );
                 if (!node) continue;
                 if (node->GetState() != State_Up)
                 {
-                    if (socks_[idst] != -1)
+                    if (socks_[indexToPnid_[idst]] != -1)
                     { // Peer socket is still active
                         if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
                         {
@@ -6152,10 +6138,10 @@
                                           "socks_[%d]=%d\n"
                                         , method_name, __LINE__
                                         , node->GetName(), node->GetPNid()
-                                        , idst, socks_[idst] );
+                                        , indexToPnid_[idst], socks_[indexToPnid_[idst]] );
                         }
-                        stats[idst].MPI_ERROR = MPI_ERR_EXITED;
-                        stats[idst].count = 0;
+                        stats[indexToPnid_[idst]].MPI_ERROR = MPI_ERR_EXITED;
+                        stats[indexToPnid_[idst]].count = 0;
                         err = MPI_ERR_IN_STATUS;
                         if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
                         {
@@ -6163,97 +6149,123 @@
                                           "stats[%d].MPI_ERROR=%s\n"
                                         , method_name, __LINE__
                                         , node->GetName(), node->GetPNid()
-                                        , idst
-                                        , ErrorMsg(stats[idst].MPI_ERROR) );
+                                        , indexToPnid_[idst]
+                                        , ErrorMsg(stats[indexToPnid_[idst]].MPI_ERROR) );
                         }
 
                         --currentNodes_;
                         // Clear bit in set of "up nodes"
-                        upNodes_.upNodes[idst/MAX_NODE_BITMASK] &= ~(1ull << (idst%MAX_NODE_BITMASK));
+                        upNodes_.upNodes[indexToPnid_[idst]/MAX_NODE_BITMASK] &= ~(1ull << (indexToPnid_[idst]%MAX_NODE_BITMASK));
             
                         // Remove old socket from epoll set, it may not be there
                         struct epoll_event event;
-                        event.data.fd = socks_[idst];
+                        event.data.fd = socks_[indexToPnid_[idst]];
                         event.events = 0;
-                        EpollCtlDelete( epollFD_, socks_[idst], &event );
-                        socks_[idst] = -1;
+                        EpollCtlDelete( epollFD_, socks_[indexToPnid_[idst]], &event );
+                        shutdown( socks_[indexToPnid_[idst]], SHUT_RDWR);
+                        close( socks_[indexToPnid_[idst]] );
+                        socks_[indexToPnid_[idst]] = -1;
                     }
                     continue;
                 }
                 if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
                 {
-                    trace_printf( "%s@%d - Pinging Node %s (%d) to see if it's up\n"
+                    trace_printf( "%s@%d - Pinging node %s (%d) to see if it's up, "
+                                  "indexToPnid_[%d]=%d\n"
                                 , method_name, __LINE__
-                                , node->GetName(), node->GetPNid() );
+                                , node->GetName(), node->GetPNid()
+                                , idst, indexToPnid_[idst] );
                 }
-                if (PingSockPeer(node))
+                peer = &peers[node->GetPNid()];
+                if (PingSockPeer( node, peer->znodeFailedTime ))
                 {
-                    reconnectSock = ConnectSockPeer( node, idst, reestablishConnections );
-                    if (reconnectSock == -1)
+                    if (resetConnections)
                     {
-                        stats[idst].MPI_ERROR = MPI_ERR_EXITED;
-                        stats[idst].count = 0;
-                        err = MPI_ERR_IN_STATUS;
+                        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+                        {
+                            trace_printf( "%s@%d - Connecting to node %s (%d), "
+                                          "idst=%d, indexToPnid_[%d]=%d\n"
+                                        , method_name, __LINE__
+                                        , node->GetName(), node->GetPNid()
+                                        , idst, idst
+                                        , indexToPnid_[idst] );
+                        }
+                        reconnectSock = ConnectSockPeer( node, indexToPnid_[idst], resetConnections );
+                        if (reconnectSock == -1)
+                        {
+                            stats[indexToPnid_[idst]].MPI_ERROR = MPI_ERR_EXITED;
+                            stats[indexToPnid_[idst]].count = 0;
+                            err = MPI_ERR_IN_STATUS;
+                            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+                            {
+                                trace_printf( "%s@%d - Setting Node %s (%d) status to "
+                                              "stats[%d].MPI_ERROR=%s\n"
+                                            , method_name, __LINE__
+                                            , node->GetName(), node->GetPNid()
+                                            , indexToPnid_[idst]
+                                            , ErrorMsg(stats[indexToPnid_[idst]].MPI_ERROR) );
+                            }
+                        }
+                    }
+                }
+                else
+                {
+                    if ((ZClientEnabled && ZClient->IsRunningZNodeExpired( node->GetName(), zerr ))
+                     || MyNode->IsPendingNodeDown()
+                     || MyNode->GetState() != State_Up
+                     || node->GetState()   != State_Up)
+                    {
+                        if (socks_[indexToPnid_[idst]] != -1)
+                        {
+                            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+                            {
+                                trace_printf( "%s@%d - Node %s (%d) is not up, "
+                                              "removing old socket from epoll set, "
+                                              "socks_[%d]=%d, zerr=%d\n"
+                                            , method_name, __LINE__
+                                            , node->GetName(), node->GetPNid()
+                                            , indexToPnid_[idst], socks_[indexToPnid_[idst]] 
+                                            , zerr );
+                            }
+    
+                            --currentNodes_;
+                            // Clear bit in set of "up nodes"
+                            upNodes_.upNodes[indexToPnid_[idst]/MAX_NODE_BITMASK] &= ~(1ull << (indexToPnid_[idst]%MAX_NODE_BITMASK));
+                
+                            // Remove old socket from epoll set, it may not be there
+                            struct epoll_event event;
+                            event.data.fd = socks_[indexToPnid_[idst]];
+                            event.events = 0;
+                            EpollCtlDelete( epollFD_, socks_[indexToPnid_[idst]], &event );
+                            shutdown( socks_[indexToPnid_[idst]], SHUT_RDWR);
+                            close( socks_[indexToPnid_[idst]] );
+                            socks_[indexToPnid_[idst]] = -1;
+                        }
+                        reconnectSock = -1;
+                        stats[indexToPnid_[idst]].MPI_ERROR = MPI_ERR_EXITED;
+                        stats[indexToPnid_[idst]].count = 0;
                         if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
                         {
                             trace_printf( "%s@%d - Setting Node %s (%d) status to "
                                           "stats[%d].MPI_ERROR=%s\n"
                                         , method_name, __LINE__
                                         , node->GetName(), node->GetPNid()
-                                        , idst
-                                        , ErrorMsg(stats[idst].MPI_ERROR) );
+                                        , indexToPnid_[idst]
+                                        , ErrorMsg(stats[indexToPnid_[idst]].MPI_ERROR) );
                         }
                     }
-                }
-                else
-                {
-                    if (socks_[idst] != -1)
-                    {
-                        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
-                        {
-                            trace_printf( "%s@%d - Node %s (%d) is not up, "
-                                          "removing old socket from epoll set, "
-                                          "socks_[%d]=%d\n"
-                                        , method_name, __LINE__
-                                        , node->GetName(), node->GetPNid()
-                                        , idst, socks_[idst] );
-                        }
-
-                        --currentNodes_;
-                        // Clear bit in set of "up nodes"
-                        upNodes_.upNodes[idst/MAX_NODE_BITMASK] &= ~(1ull << (idst%MAX_NODE_BITMASK));
-            
-                        // Remove old socket from epoll set, it may not be there
-                        struct epoll_event event;
-                        event.data.fd = socks_[idst];
-                        event.events = 0;
-                        EpollCtlDelete( epollFD_, socks_[idst], &event );
-                        socks_[idst] = -1;
-                    }
-                    reconnectSock = -1;
-                    stats[idst].MPI_ERROR = MPI_ERR_EXITED;
-                    stats[idst].count = 0;
                     err = MPI_ERR_IN_STATUS;
-                    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
-                    {
-                        trace_printf( "%s@%d - Setting Node %s (%d) status to "
-                                      "stats[%d].MPI_ERROR=%s\n"
-                                    , method_name, __LINE__
-                                    , node->GetName(), node->GetPNid()
-                                    , idst
-                                    , ErrorMsg(stats[idst].MPI_ERROR) );
-                    }
                 }
             }
             else if ( j == MyPNID )
-            { // Current [j] is my node, accept connection from peer [i] node
+            { // Current indexToPnid_[j] is my node, accept connection from peer indexToPnid_[i] node
 
                 idst = i;
-                node = Nodes->GetNode( idst );
+                node = Nodes->GetNode( indexToPnid_[idst] );
                 if (!node) continue;
                 if (node->GetState() != State_Up)
                 {
-                    if (socks_[idst] != -1)
+                    if (socks_[indexToPnid_[idst]] != -1)
                     { // Peer socket is still active
                         if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
                         {
@@ -6262,10 +6274,10 @@
                                           "socks_[%d]=%d\n"
                                         , method_name, __LINE__
                                         , node->GetName(), node->GetPNid()
-                                        , idst, socks_[idst] );
+                                        , indexToPnid_[idst], socks_[indexToPnid_[idst]] );
                         }
-                        stats[idst].MPI_ERROR = MPI_ERR_EXITED;
-                        stats[idst].count = 0;
+                        stats[indexToPnid_[idst]].MPI_ERROR = MPI_ERR_EXITED;
+                        stats[indexToPnid_[idst]].count = 0;
                         err = MPI_ERR_IN_STATUS;
                         if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
                         {
@@ -6273,86 +6285,99 @@
                                           "stats[%d].MPI_ERROR=%s\n"
                                         , method_name, __LINE__
                                         , node->GetName(), node->GetPNid()
-                                        , idst
-                                        , ErrorMsg(stats[idst].MPI_ERROR) );
+                                        , indexToPnid_[idst]
+                                        , ErrorMsg(stats[indexToPnid_[idst]].MPI_ERROR) );
                         }
 
                         --currentNodes_;
                         // Clear bit in set of "up nodes"
-                        upNodes_.upNodes[idst/MAX_NODE_BITMASK] &= ~(1ull << (idst%MAX_NODE_BITMASK));
+                        upNodes_.upNodes[indexToPnid_[idst]/MAX_NODE_BITMASK] &= ~(1ull << (indexToPnid_[idst]%MAX_NODE_BITMASK));
             
                         // Remove old socket from epoll set, it may not be there
                         struct epoll_event event;
-                        event.data.fd = socks_[idst];
+                        event.data.fd = socks_[indexToPnid_[idst]];
                         event.events = 0;
-                        EpollCtlDelete( epollFD_, socks_[idst], &event );
-                        socks_[idst] = -1;
+                        EpollCtlDelete( epollFD_, socks_[indexToPnid_[idst]], &event );
+                        shutdown( socks_[indexToPnid_[idst]], SHUT_RDWR);
+                        close( socks_[indexToPnid_[idst]] );
+                        socks_[indexToPnid_[idst]] = -1;
                     }
                     continue;
                 }
                 if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
                 {
-                    trace_printf( "%s@%d - Pinging Node %s (%d) to see if it's up\n"
+                    trace_printf( "%s@%d - Pinging node %s (%d) to see if it's up\n"
                                 , method_name, __LINE__
                                 , node->GetName(), node->GetPNid() );
                 }
-                if (PingSockPeer(node))
+                peer = &peers[node->GetPNid()];
+                if (PingSockPeer( node, peer->znodeFailedTime ))
                 {
-                    reconnectSock = AcceptSockPeer( node, idst, reestablishConnections );
-                    if (reconnectSock == -1)
+                    if (resetConnections)
                     {
-                        stats[idst].MPI_ERROR = MPI_ERR_EXITED;
-                        stats[idst].count = 0;
-                        err = MPI_ERR_IN_STATUS;
+                        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+                        {
+                            trace_printf( "%s@%d - Accepting from node %s (%d), "
+                                          "idst=%d, indexToPnid_[%d]=%d\n"
+                                        , method_name, __LINE__
+                                        , node->GetName(), node->GetPNid()
+                                        , idst, idst
+                                        , indexToPnid_[idst] );
+                        }
+                        reconnectSock = AcceptSockPeer( stats, resetConnections );
+                        if (reconnectSock == -1)
+                        {
+                            err = MPI_ERR_IN_STATUS;
+                        }
+                    }
+                }
+                else
+                {
+                    if ((ZClientEnabled && ZClient->IsRunningZNodeExpired( node->GetName(), zerr ))
+                     || MyNode->IsPendingNodeDown()
+                     || MyNode->GetState() != State_Up
+                     || node->GetState()   != State_Up)
+                    {
+                        if (socks_[indexToPnid_[idst]] != -1)
+                        {
+                            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+                            {
+                                trace_printf( "%s@%d - Node %s (%d) is not up, "
+                                              "removing old socket from epoll set, "
+                                              "socks_[%d]=%d, zerr=%d\n"
+                                            , method_name, __LINE__
+                                            , node->GetName(), node->GetPNid()
+                                            , indexToPnid_[idst], socks_[indexToPnid_[idst]]
+                                            , zerr );
+                            }
+    
+                            --currentNodes_;
+                            // Clear bit in set of "up nodes"
+                            upNodes_.upNodes[indexToPnid_[idst]/MAX_NODE_BITMASK] &= ~(1ull << (indexToPnid_[idst]%MAX_NODE_BITMASK));
+                
+                            // Remove old socket from epoll set, it may not be there
+                            struct epoll_event event;
+                            event.data.fd = socks_[indexToPnid_[idst]];
+                            event.events = 0;
+                            EpollCtlDelete( epollFD_, socks_[indexToPnid_[idst]], &event );
+                            shutdown( socks_[indexToPnid_[idst]], SHUT_RDWR);
+                            close( socks_[indexToPnid_[idst]] );
+                            socks_[indexToPnid_[idst]] = -1;
+                        }
+                        reconnectSock = -1;
+                        stats[indexToPnid_[idst]].MPI_ERROR = MPI_ERR_EXITED;
+                        stats[indexToPnid_[idst]].count = 0;
                         if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
                         {
                             trace_printf( "%s@%d - Setting Node %s (%d) status to "
                                           "stats[%d].MPI_ERROR=%s\n"
                                         , method_name, __LINE__
                                         , node->GetName(), node->GetPNid()
-                                        , idst
-                                        , ErrorMsg(stats[idst].MPI_ERROR) );
+                                        , indexToPnid_[idst]
+                                        , ErrorMsg(stats[indexToPnid_[idst]].MPI_ERROR) );
                         }
                     }
-                }
-                else
-                {
-                    if (socks_[idst] != -1)
-                    {
-                        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
-                        {
-                            trace_printf( "%s@%d - Node %s (%d) is not up, "
-                                          "removing old socket from epoll set, "
-                                          "socks_[%d]=%d\n"
-                                        , method_name, __LINE__
-                                        , node->GetName(), node->GetPNid()
-                                        , idst, socks_[idst] );
-                        }
-
-                        --currentNodes_;
-                        // Clear bit in set of "up nodes"
-                        upNodes_.upNodes[idst/MAX_NODE_BITMASK] &= ~(1ull << (idst%MAX_NODE_BITMASK));
-            
-                        // Remove old socket from epoll set, it may not be there
-                        struct epoll_event event;
-                        event.data.fd = socks_[idst];
-                        event.events = 0;
-                        EpollCtlDelete( epollFD_, socks_[idst], &event );
-                        socks_[idst] = -1;
-                    }
-                    reconnectSock = -1;
-                    stats[idst].MPI_ERROR = MPI_ERR_EXITED;
-                    stats[idst].count = 0;
                     err = MPI_ERR_IN_STATUS;
-                    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
-                    {
-                        trace_printf( "%s@%d - Setting Node %s (%d) status to "
-                                      "stats[%d].MPI_ERROR=%s\n"
-                                    , method_name, __LINE__
-                                    , node->GetName(), node->GetPNid()
-                                    , idst
-                                    , ErrorMsg(stats[idst].MPI_ERROR) );
-                    }
                 }
             }
             else
@@ -6361,14 +6386,14 @@
             }
             if ( idst >= 0
               && reconnectSock != -1
-              && socks_[idst] != -1
-              && fcntl( socks_[idst], F_SETFL, O_NONBLOCK ) )
+              && socks_[indexToPnid_[idst]] != -1
+              && fcntl( socks_[indexToPnid_[idst]], F_SETFL, O_NONBLOCK ) )
             {
                 err = MPI_ERR_AMODE;
                 char ebuff[256];
                 char buf[MON_STRING_BUF_SIZE];
                 snprintf( buf, sizeof(buf), "[%s@%d] fcntl(socks_[%d]=%d,F_SETFL,NONBLOCK) error: %s\n",
-                    method_name, __LINE__,idst, socks_[idst], strerror_r( errno, ebuff, 256 ) );
+                    method_name, __LINE__,indexToPnid_[idst], socks_[indexToPnid_[idst]], strerror_r( errno, ebuff, 256 ) );
                 mon_log_write( MON_CLUSTER_ALLGATHERSOCKRECONN_1, SQ_LOG_CRIT, buf );
             }
         }
@@ -6389,15 +6414,17 @@
                             , ErrorMsg(stats[indexToPnid_[i]].MPI_ERROR) );
             }
         }
-        trace_printf( "%s@%d - Returning err=%d\n"
-                    , method_name, __LINE__, err );
+        trace_printf( "%s@%d - Returning err=%d(%s)\n"
+                    , method_name, __LINE__, err, ErrorMsg(err) );
     }
 
+    Monitor->EnterSyncCycle();
+
     TRACE_EXIT;
     return( err );
 }
 
-int CCluster::AcceptSockPeer( CNode *node, int peer, bool reestablishConnections )
+int CCluster::AcceptSockPeer( MPI_Status *stats, bool resetConnections )
 {
     const char method_name[] = "CCluster::AcceptSockPeer";
     TRACE_ENTRY;
@@ -6418,33 +6445,21 @@
                 , MyNode->GetName()
                 , strerror_r( h_errno, ebuff, 256 ) );
         mon_log_write( MON_CLUSTER_ACCEPTSOCKPEER_1, SQ_LOG_CRIT, buf );
-        abort();
+
+        mon_failure_exit();
     }
     else
     {
         if (trace_settings & TRACE_RECOVERY)
         {
-            trace_printf( "%s@%d Accepting server socket: from %s(%d), port=%d\n"
+            trace_printf( "%s@%d Accepting server socket on port=%d\n"
                         , method_name, __LINE__
-                        , node->GetName(), node->GetPNid()
                         , MyNode->GetSyncSocketPort() );
         }
 
         // Accept connection from peer
         reconnectSock = AcceptSock( syncSock_ );
-        if (reconnectSock != -1)
-        {
-            if (trace_settings & TRACE_RECOVERY)
-            {
-                trace_printf( "%s@%d Server %s(%d) accepted from client %s(%d), old socks_[%d]=%d, new socks_[%d]=%d\n"
-                            , method_name, __LINE__
-                            , MyNode->GetName(), MyPNID
-                            , node->GetName(), node->GetPNid()
-                            , peer, socks_[peer]
-                            , peer, reconnectSock);
-            }
-        }
-        else
+        if (reconnectSock < 0)
         {
             char buf[MON_STRING_BUF_SIZE];
             snprintf( buf, sizeof(buf), "[%s@%d] AcceptSock(%d) failed!\n",
@@ -6453,48 +6468,169 @@
             rc = -1;
         }
 
-        if (reestablishConnections)
+        if (rc != -1 && resetConnections)
         {
-            if (socks_[peer] != -1)
+            if (reconnectSock > -1)
             {
-                // Remove old socket from epoll set, it may not be there
-                struct epoll_event event;
-                event.data.fd = socks_[peer];
-                event.events = 0;
-                EpollCtlDelete( epollFD_, socks_[peer], &event );
-                if (node->GetState() != State_Up)
+                nodeSyncInfo_t readSyncInfo;
+                // Get info about connecting monitor
+                rc = ReceiveSock( (char *) &readSyncInfo
+                                , sizeof(nodeSyncInfo_t)
+                                , reconnectSock
+                                , method_name );
+                if ( rc )
+                {   // Handle error
+                    shutdown( reconnectSock, SHUT_RDWR);
+                    close( (int)reconnectSock );
+
+                    char buf[MON_STRING_BUF_SIZE];
+                    snprintf( buf, sizeof(buf)
+                            , "[%s], unable to obtain node sync info from remote "
+                              "monitor: %s.\n"
+                            , method_name, ErrorMsg(rc));
+                    mon_log_write( MON_CLUSTER_ACCEPTSOCKPEER_4, SQ_LOG_ERR, buf );
+                }
+                else
                 {
                     if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
                     {
-                        trace_printf( "%s@%d - Node %s (%d) is not up, "
-                                      "removing old socket from epoll set, "
-                                      "socks_[%d]=%d\n"
+                        trace_printf( "%s@%d - Received remote SyncInfo.pnid=%d, "
+                                      "SyncInfo.nodeName=%s, "
+                                      "SyncInfo.seqNum=%lld, "
+                                      "SyncInfo.reconnectSeqNum=%lld\n"
                                     , method_name, __LINE__
-                                    , node->GetName(), node->GetPNid()
-                                    , peer, socks_[peer] );
+                                    , readSyncInfo.pnid
+                                    , readSyncInfo.nodeName
+                                    , readSyncInfo.seqNum
+                                    , readSyncInfo.reconnectSeqNum );
                     }
-                    socks_[peer] = -1;
+
+                    CNode *acceptedNode = Nodes->GetNode( readSyncInfo.pnid );
+                    if (!acceptedNode)
+                    {
+                        shutdown( reconnectSock, SHUT_RDWR);
+                        close( (int)reconnectSock );
+
+                        char buf[MON_STRING_BUF_SIZE];
+                        snprintf( buf, sizeof(buf), "[%s@%d] AcceptSock(%d) failed!\n",
+                            method_name, __LINE__, syncSock_ );
+                        mon_log_write( MON_CLUSTER_ACCEPTSOCKPEER_2, SQ_LOG_ERR, buf );
+                        return(-1);
+                    }
+
+                    char buf[MON_STRING_BUF_SIZE];
+                    snprintf( buf, sizeof(buf)
+                            , "[%s@%d] Resetting remote connection with %s(%d)\n"
+                            , method_name, __LINE__
+                            , acceptedNode->GetName(), acceptedNode->GetPNid() );
+                    mon_log_write( MON_CLUSTER_ACCEPTSOCKPEER_3, SQ_LOG_INFO, buf );
+
+                    nodeSyncInfo_t writeSyncInfo;
+
+                    strcpy(writeSyncInfo.nodeName, MyNode->GetName());
+                    writeSyncInfo.pnid = MyPNID;
+                    writeSyncInfo.seqNum = seqNum_;
+                    writeSyncInfo.reconnectSeqNum = reconnectSeqNum_;
+                    rc = SendSock( (char *) &writeSyncInfo
+                                 , sizeof(nodeSyncInfo_t)
+                                 , reconnectSock
+                                 , method_name );
+                    if ( rc )
+                    {
+                        shutdown( reconnectSock, SHUT_RDWR);
+                        close( (int)reconnectSock );
+    
+                        char buf[MON_STRING_BUF_SIZE];
+                        snprintf( buf, sizeof(buf)
+                                , "[%s], Cannot send sync node info to node %s: (%s)\n"
+                                , method_name
+                                , acceptedNode->GetName(), ErrorMsg(rc));
+                        mon_log_write(MON_CLUSTER_ACCEPTSOCKPEER_5, SQ_LOG_ERR, buf);
+                    }
+                    else
+                    {
+                        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+                        {
+                            trace_printf( "%s@%d - Sent my SyncInfo.pnid=%d, "
+                                          "SyncInfo.nodeName=%s, "
+                                          "SyncInfo.seqNum=%lld, "
+                                          "SyncInfo.reconnectSeqNum=%lld\n"
+                                        , method_name, __LINE__
+                                        , writeSyncInfo.pnid
+                                        , writeSyncInfo.nodeName
+                                        , writeSyncInfo.seqNum
+                                        , writeSyncInfo.reconnectSeqNum );
+                        }
+
+                        if (trace_settings & TRACE_RECOVERY)
+                        {
+                            trace_printf( "%s@%d Server %s(%d) accepted from client %s(%d), old socks_[%d]=%d, new socks_[%d]=%d\n"
+                                        , method_name, __LINE__
+                                        , MyNode->GetName(), MyPNID
+                                        , acceptedNode->GetName(), acceptedNode->GetPNid()
+                                        , acceptedNode->GetPNid(), socks_[acceptedNode->GetPNid()]
+                                        , acceptedNode->GetPNid(), reconnectSock);
+                        }
+
+                        if (socks_[acceptedNode->GetPNid()] != -1)
+                        {
+                            // Remove old socket from epoll set, it may not be there
+                            struct epoll_event event;
+                            event.data.fd = socks_[acceptedNode->GetPNid()];
+                            event.events = 0;
+                            EpollCtlDelete( epollFD_, socks_[acceptedNode->GetPNid()], &event );
+                            if (acceptedNode->GetState() != State_Up)
+                            {
+                                if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+                                {
+                                    trace_printf( "%s@%d - Node %s (%d) is not up, "
+                                                  "removing old socket from epoll set, "
+                                                  "socks_[%d]=%d\n"
+                                                , method_name, __LINE__
+                                                , acceptedNode->GetName(), acceptedNode->GetPNid()
+                                                , acceptedNode->GetPNid(), socks_[acceptedNode->GetPNid()] );
+                                }
+                                shutdown( socks_[acceptedNode->GetPNid()], SHUT_RDWR);
+                                close( socks_[acceptedNode->GetPNid()] );
+                                socks_[acceptedNode->GetPNid()] = -1;
+                                stats[acceptedNode->GetPNid()].MPI_ERROR = MPI_ERR_EXITED;
+                                stats[acceptedNode->GetPNid()].count = 0;
+
+                                if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+                                {
+                                    trace_printf( "%s@%d - Setting Node %s (%d) status to "
+                                                  "stats[%d].MPI_ERROR=%s\n"
+                                                , method_name, __LINE__
+                                                , acceptedNode->GetName(), acceptedNode->GetPNid()
+                                                , acceptedNode->GetPNid()
+                                                , ErrorMsg(stats[acceptedNode->GetPNid()].MPI_ERROR) );
+                                }
+                            }
+                            else
+                            {
+                                socks_[acceptedNode->GetPNid()] = reconnectSock; // AcceptSockPeer
+                            }
+                        }
+                    }
                 }
             }
-            if (reconnectSock != -1)
-            {
-                socks_[peer] = reconnectSock; // AcceptSockPeer
-            }
         }
         else
         {
-            if (reconnectSock != -1)
+            if (reconnectSock > -1)
             {
+                shutdown( reconnectSock, SHUT_RDWR);
                 close( (int)reconnectSock );
             }
         }
     }
 
+
     TRACE_EXIT;
     return rc;
 }
 
-int CCluster::ConnectSockPeer( CNode *node, int peer, bool reestablishConnections )
+int CCluster::ConnectSockPeer( CNode *node, int peer, bool resetConnections )
 {
     const char method_name[] = "CCluster::ConnectSockPeer";
     TRACE_ENTRY;
@@ -6516,7 +6652,8 @@
                 , MyNode->GetName()
                 , strerror_r( h_errno, ebuff, 256 ) );
         mon_log_write( MON_CLUSTER_CONNECTSOCKPEER_1, SQ_LOG_CRIT, buf );
-        abort();
+
+        mon_failure_exit();
     }
     else
     {
@@ -6533,7 +6670,8 @@
                 method_name, __LINE__, node->GetName(),
                 strerror_r( h_errno, ebuff, 256 ) );
             mon_log_write( MON_CLUSTER_CONNECTSOCKPEER_2, SQ_LOG_CRIT, buf );
-            abort();
+
+            mon_failure_exit();
         }
         // Initialize peer's destination address structure
         memcpy( dstaddr, he->h_addr, 4 );
@@ -6556,7 +6694,7 @@
         }
         // Connect to peer
         reconnectSock = MkCltSock( srcaddr, dstaddr, sockPorts_[peer] );
-        if (reconnectSock != -1)
+        if (reconnectSock > -1)
         {
             if (trace_settings & TRACE_RECOVERY)
             {
@@ -6588,8 +6726,15 @@
             rc = -1;
         }
 
-        if (reestablishConnections)
+        if (rc != -1 && resetConnections)
         {
+            char buf[MON_STRING_BUF_SIZE];
+            snprintf( buf, sizeof(buf)
+                    , "[%s@%d] Resetting remote connection with %s(%d)\n"
+                    , method_name, __LINE__
+                    , node->GetName(), node->GetPNid() );
+            mon_log_write( MON_CLUSTER_CONNECTSOCKPEER_4, SQ_LOG_INFO, buf );
+
             if (socks_[peer] != -1)
             {
                 // Remove old socket from epoll set, it may not be there
@@ -6608,18 +6753,92 @@
                                     , node->GetName(), node->GetPNid()
                                     , peer, socks_[peer] );
                     }
+                    shutdown( socks_[peer], SHUT_RDWR);
+                    close( socks_[peer] );
                     socks_[peer] = -1;
                 }
             }
-            if (reconnectSock != -1)
+
+            if (reconnectSock > -1)
             {
-                socks_[peer] = reconnectSock; // ConnectSockPeer
+                nodeSyncInfo_t writeSyncInfo;
+
+                strcpy(writeSyncInfo.nodeName, MyNode->GetName());
+                writeSyncInfo.pnid = MyPNID;
+                writeSyncInfo.seqNum = seqNum_;
+                writeSyncInfo.reconnectSeqNum = reconnectSeqNum_;
+                rc = SendSock( (char *) &writeSyncInfo
+                             , sizeof(nodeSyncInfo_t)
+                             , reconnectSock
+                             , method_name );
+                if ( rc )
+                {
+                    shutdown( reconnectSock, SHUT_RDWR);
+                    close( (int)reconnectSock );
+
+                    char buf[MON_STRING_BUF_SIZE];
+                    snprintf( buf, sizeof(buf)
+                            , "[%s], Cannot send sync node info to node %s: (%s)\n"
+                            , method_name
+                            , node?node->GetName():"", ErrorMsg(rc));
+                    mon_log_write(MON_CLUSTER_CONNECTSOCKPEER_5, SQ_LOG_ERR, buf);    
+                }
+                else
+                {
+                    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+                    {
+                        trace_printf( "%s@%d - Sent my SyncInfo.pnid=%d, "
+                                      "SyncInfo.nodeName=%s, "
+                                      "SyncInfo.seqNum=%lld, "
+                                      "SyncInfo.reconnectSeqNum=%lld\n"
+                                    , method_name, __LINE__
+                                    , writeSyncInfo.pnid
+                                    , writeSyncInfo.nodeName
+                                    , writeSyncInfo.seqNum
+                                    , writeSyncInfo.reconnectSeqNum );
+                    }
+                    nodeSyncInfo_t readSyncInfo;
+                    // Get sync info returned by the remote monitor we connected to
+                    rc = ReceiveSock( (char *) &readSyncInfo
+                                    , sizeof(nodeSyncInfo_t)
+                                    , reconnectSock
+                                    , method_name );
+                    if ( rc )
+                    {   // Handle error
+                        shutdown( reconnectSock, SHUT_RDWR);
+                        close( (int)reconnectSock );
+
+                        char buf[MON_STRING_BUF_SIZE];
+                        snprintf( buf, sizeof(buf)
+                                , "[%s], unable to obtain node sync info from remote "
+                                  "monitor: %s.\n"
+                                , method_name, ErrorMsg(rc));
+                        mon_log_write(MON_CLUSTER_CONNECTSOCKPEER_6, SQ_LOG_ERR, buf);    
+                    }
+                    else
+                    {
+                        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+                        {
+                            trace_printf( "%s@%d - Received remote SyncInfo.pnid=%d, "
+                                          "SyncInfo.nodeName=%s, "
+                                          "SyncInfo.seqNum=%lld, "
+                                          "SyncInfo.reconnectSeqNum=%lld\n"
+                                        , method_name, __LINE__
+                                        , readSyncInfo.pnid
+                                        , readSyncInfo.nodeName
+                                        , readSyncInfo.seqNum
+                                        , readSyncInfo.reconnectSeqNum );
+                        }
+                        socks_[peer] = reconnectSock; // ConnectSockPeer
+                    }
+                }
             }
         }
         else
         {
-            if (reconnectSock != -1)
+            if (reconnectSock > -1)
             {
+                shutdown( reconnectSock, SHUT_RDWR);
                 close( (int)reconnectSock );
             }
         }
@@ -7036,14 +7255,15 @@
     // Add to dead node name list
     CNode *downNode = Nodes->GetNode( pnid );
     assert(downNode);
+    if (downNode->GetState() != State_Down)
+    {
+        downNode->SetPendingNodeDown(true);
+    }
     deadNodeList_.push_back( downNode );
 
     if (trace_settings & TRACE_INIT)
         trace_printf("%s@%d - Added down node to list, pnid=%d, name=(%s)\n", method_name, __LINE__, downNode->GetPNid(), downNode->GetName());
 
-    // assign new leaders if needed
-    AssignLeaders( pnid, downNode->GetName(), false );
-
     // Build available list of spare nodes
     CNode *spareNode;
     NodesList *spareNodesList = Nodes->GetSpareNodesList();
@@ -7158,17 +7378,17 @@
 
     // Populate nodestate array using node state info from "allgather"
     // along with local node state.
-    for (int index = 0; index < GetConfigPNodesMax(); index++)
+    for (int index = 0; index < GetConfigPNodesCount(); index++)
     {
         // Only process active nodes
         bool noComm;
         switch( CommType )
         {
             case CommType_InfiniBand:
-                noComm = (comms_[index] == MPI_COMM_NULL) ? true : false;
+                noComm = (comms_[indexToPnid_[index]] == MPI_COMM_NULL) ? true : false;
                 break;
             case CommType_Sockets:
-                noComm = (socks_[index] == -1) ? true : false;
+                noComm = (socks_[indexToPnid_[index]] == -1) ? true : false;
                 break;
             default:
                 // Programmer bonehead!
@@ -7176,7 +7396,7 @@
         }
 
         if (noComm
-         || status[index].MPI_ERROR != MPI_SUCCESS)
+         || status[indexToPnid_[index]].MPI_ERROR != MPI_SUCCESS)
         {
             if (trace_settings & (TRACE_RECOVERY | TRACE_INIT))
             {
@@ -7184,54 +7404,54 @@
                 {
                     trace_printf( "%s@%d - Communication error from node %d, "
                                   " seq_num=#%lld\n"
-                                , method_name, __LINE__, index
+                                , method_name, __LINE__, indexToPnid_[index]
                                 , seqNum_ );
                 }
             }
             // Not an active node, set default values
-            nodestate[index].node_state = State_Unknown;
-            nodestate[index].change_nid = -1;
-            nodestate[index].seq_num     = 0;
+            nodestate[indexToPnid_[index]].node_state = State_Unknown;
+            nodestate[indexToPnid_[index]].change_nid = -1;
+            nodestate[indexToPnid_[index]].seq_num     = 0;
             for ( int i =0; i < MAX_NODE_MASKS ; i++ )
             {
-                nodestate[index].nodeMask.upNodes[i] = 0;
+                nodestate[indexToPnid_[index]].nodeMask.upNodes[i] = 0;
             }
 #ifdef NAMESERVER_PROCESS
-            nodestate[index].monConnCount = -1;
+            nodestate[indexToPnid_[index]].monConnCount = -1;
 #else
-            nodestate[index].monProcCount = 0;
+            nodestate[indexToPnid_[index]].monProcCount = 0;
 #endif
 
             continue;
         }
 
         recvBuf = (struct sync_buffer_def *)
-            (((char *) syncBuf) + index * CommBufSize);
+            (((char *) syncBuf) + indexToPnid_[index] * CommBufSize);
 
         if (trace_settings & TRACE_SYNC)
         {
             int nr;
-            MPI_Get_count(&status[index], MPI_CHAR, &nr);
+            MPI_Get_count(&status[indexToPnid_[index]], MPI_CHAR, &nr);
             trace_printf("%s@%d - Received %d bytes from node %d, "
-                         ", seq_num=%lld, message count=%d\n",
-                         method_name, __LINE__, nr, index,
+                         "seq_num=%lld, message count=%d\n",
+                         method_name, __LINE__, nr, indexToPnid_[index],
                          recvBuf->nodeInfo.seq_num,
                          recvBuf->msgInfo.msg_count);
         }
 
-        nodestate[index].node_state  = recvBuf->nodeInfo.node_state;
-        nodestate[index].change_nid  = recvBuf->nodeInfo.change_nid;
-        nodestate[index].seq_num     = recvBuf->nodeInfo.seq_num;
-        nodestate[index].nodeMask    = recvBuf->nodeInfo.nodeMask;
+        nodestate[indexToPnid_[index]].node_state  = recvBuf->nodeInfo.node_state;
+        nodestate[indexToPnid_[index]].change_nid  = recvBuf->nodeInfo.change_nid;
+        nodestate[indexToPnid_[index]].seq_num     = recvBuf->nodeInfo.seq_num;
+        nodestate[indexToPnid_[index]].nodeMask    = recvBuf->nodeInfo.nodeMask;
 #ifdef NAMESERVER_PROCESS
-        nodestate[index].monConnCount = recvBuf->nodeInfo.monConnCount;
+        nodestate[indexToPnid_[index]].monConnCount = recvBuf->nodeInfo.monConnCount;
 #else
-        nodestate[index].monProcCount = recvBuf->nodeInfo.monProcCount;
+        nodestate[indexToPnid_[index]].monProcCount = recvBuf->nodeInfo.monProcCount;
 #endif
 
         for ( int i =0; i < MAX_NODE_MASKS ; i++ )
         {
-            if ( nodestate[index].nodeMask.upNodes[i] != upNodes_.upNodes[i] )
+            if ( nodestate[indexToPnid_[index]].nodeMask.upNodes[i] != upNodes_.upNodes[i] ) 
             {
                 if (trace_settings & (TRACE_SYNC | TRACE_RECOVERY | TRACE_INIT))
                 {
@@ -7242,10 +7462,10 @@
                                       "monitor sees %llx\n"
                                     , method_name, __LINE__
                                     , seqNum_
-                                    , Node[index]->GetName()
-                                    , index
-                                    , j
-                                    , nodestate[index].nodeMask.upNodes[j]
+                                    , Node[indexToPnid_[index]]->GetName()
+                                    , indexToPnid_[index]
+                                    , j 
+                                    , nodestate[indexToPnid_[index]].nodeMask.upNodes[j]
                                     , upNodes_.upNodes[j] );
                     }
                 }
@@ -7258,28 +7478,13 @@
         {
            trace_printf( "%s@%d - Node %s (pnid=%d) TmSyncState=(%d)(%s)\n"
                        , method_name, __LINE__
-                       , Node[index]->GetName()
-                       , index
+                       , Node[indexToPnid_[index]]->GetName()
+                       , indexToPnid_[index]
                        , recvBuf->nodeInfo.tmSyncState
                        , SyncStateString( recvBuf->nodeInfo.tmSyncState ));
         }
 #endif
 
-#ifndef NAMESERVER_PROCESS
-        if ( Node[index]->GetTmSyncState() != recvBuf->nodeInfo.tmSyncState )
-        {
-            Node[index]->SetTmSyncState(recvBuf->nodeInfo.tmSyncState);
-            if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-            {
-                trace_printf("%s@%d - Node %s (pnid=%d) TmSyncState updated"
-                             " (%d)(%s)\n", method_name, __LINE__,
-                             Node[index]->GetName(), index,
-                             recvBuf->nodeInfo.tmSyncState,
-                             SyncStateString( recvBuf->nodeInfo.tmSyncState ));
-            }
-        }
-#endif
-
         // Check if we need to increase my node's shutdown level ...
         // all nodes should be at the highest level selected from any source
         if ( MyNode->GetShutdownLevel() < recvBuf->nodeInfo.sdLevel )
@@ -7292,35 +7497,35 @@
             if (trace_settings & (TRACE_REQUEST | TRACE_SYNC))
                 trace_printf("%s@%d - Node %s Shutdown Level updated (%d)\n",
                              method_name, __LINE__,
-                             Node[index]->GetName(), recvBuf->nodeInfo.sdLevel);
+                             Node[indexToPnid_[index]]->GetName(), recvBuf->nodeInfo.sdLevel);
         }
 
-        Node[index]->SetInternalState( recvBuf->nodeInfo.internalState );
+        Node[indexToPnid_[index]]->SetInternalState( recvBuf->nodeInfo.internalState );
         if ( recvBuf->nodeInfo.internalState == State_Ready_To_Exit )
         {   // The node is exiting.  Don't communicate with it any more.
             if (trace_settings & (TRACE_REQUEST | TRACE_SYNC))
                 trace_printf("%s@%d - Node %s (%d) ready to exit, setting comm "
                              "to null\n", method_name, __LINE__,
-                             Node[index]->GetName(), index);
+                             Node[indexToPnid_[index]]->GetName(), indexToPnid_[index]);
 
             switch( CommType )
             {
                 case CommType_InfiniBand:
-                    MPI_Comm_free( &comms_[index] );
+                    MPI_Comm_free( &comms_[indexToPnid_[index]] );
                     break;
                 case CommType_Sockets:
-                    shutdown( socks_[index], SHUT_RDWR );
-                    close( socks_[index] );
-                    socks_[index] = -1;
+                    shutdown( socks_[indexToPnid_[index]], SHUT_RDWR );
+                    close( socks_[indexToPnid_[index]] );
+                    socks_[indexToPnid_[index]] = -1;
                     break;
                 default:
                     // Programmer bonehead!
                     abort();
             }
-            Node[index]->SetState( State_Down );
+            Node[indexToPnid_[index]]->SetState( State_Down );
             --currentNodes_;
             // Clear bit in set of "up nodes"
-            upNodes_.upNodes[index/MAX_NODE_BITMASK] &= ~(1ull << (index%MAX_NODE_BITMASK));
+            upNodes_.upNodes[indexToPnid_[index]/MAX_NODE_BITMASK] &= ~(1ull << (indexToPnid_[index]%MAX_NODE_BITMASK));
         }
     }
 
@@ -7335,7 +7540,16 @@
                      "incorrect.  Aborting!\n", method_name, seqNum_);
             mon_log_write(MON_CLUSTER_UPDTCLUSTERSTATE_1, SQ_LOG_CRIT, buf);
             mem_log_write(CMonLog::MON_UPDATE_CLUSTER_2, MyPNID);
-            abort();
+            if( IsRealCluster )
+            { // Terminate CommAccept thread, remote pings will fail
+                CommAccept.shutdownWork();
+                if ( ZClientEnabled )
+                {
+                    ZClient->RunningZNodeDelete( MyNode->GetName() );
+                    ZClient->MasterZNodeDelete( MyNode->GetName() );
+                }
+            }
+            mon_failure_exit();
         }
     }
 
@@ -7350,16 +7564,16 @@
 #endif
 
     // Examine status returned from MPI receive requests
-    for (int index = 0; index < GetConfigPNodesMax(); index++)
+    for (int index = 0; index < GetConfigPNodesCount(); index++)
     {
         bool noComm;
         switch( CommType )
         {
             case CommType_InfiniBand:
-                noComm = (comms_[index] == MPI_COMM_NULL) ? true : false;
+                noComm = (comms_[indexToPnid_[index]] == MPI_COMM_NULL) ? true : false;
                 break;
             case CommType_Sockets:
-                noComm = (socks_[index] == -1) ? true : false;
+                noComm = (socks_[indexToPnid_[index]] == -1) ? true : false;
                 break;
             default:
                 // Programmer bonehead!
@@ -7367,29 +7581,29 @@
         }
         if (noComm) continue;
 
-        if (status[index].MPI_ERROR != MPI_SUCCESS)
+        if (status[indexToPnid_[index]].MPI_ERROR != MPI_SUCCESS)
         {
             char buf[MON_STRING_BUF_SIZE];
             snprintf(buf, sizeof(buf), "[%s] MPI communications error=%d "
                      "(%s) for node %d (at seq #%lld).\n", method_name,
-                     status[index].MPI_ERROR, ErrorMsg(status[index].MPI_ERROR),
-                     index,  seqNum_);
-            mon_log_write(MON_CLUSTER_UPDTCLUSTERSTATE_2, SQ_LOG_ERR, buf);
+                     status[indexToPnid_[index]].MPI_ERROR, ErrorMsg(status[indexToPnid_[index]].MPI_ERROR),
+                     indexToPnid_[index],  seqNum_);
+            mon_log_write(MON_CLUSTER_UPDTCLUSTERSTATE_2, SQ_LOG_ERR, buf); 
 
-            if ( status[index].MPI_ERROR == MPI_ERR_EXITED )
+            if ( status[indexToPnid_[index]].MPI_ERROR == MPI_ERR_EXITED )
             {   // A monitor has gone away
 
-                mem_log_write(CMonLog::MON_UPDATE_CLUSTER_1, index);
+                mem_log_write(CMonLog::MON_UPDATE_CLUSTER_1, indexToPnid_[index]);
 
                 switch( CommType )
                 {
                     case CommType_InfiniBand:
-                        MPI_Comm_free( &comms_[index] );
+                        MPI_Comm_free( &comms_[indexToPnid_[index]] );
                         break;
                     case CommType_Sockets:
-                        shutdown( socks_[index], SHUT_RDWR );
-                        close( socks_[index] );
-                        socks_[index] = -1;
+                        shutdown( socks_[indexToPnid_[index]], SHUT_RDWR );
+                        close( socks_[indexToPnid_[index]] );
+                        socks_[indexToPnid_[index]] = -1;
                         break;
                     default:
                         // Programmer bonehead!
@@ -7398,21 +7612,21 @@
                 --currentNodes_;
 
                 // Clear bit in set of "up nodes"
-                upNodes_.upNodes[index/MAX_NODE_BITMASK] &= ~(1ull << (index%MAX_NODE_BITMASK));
+                upNodes_.upNodes[indexToPnid_[index]/MAX_NODE_BITMASK] &= ~(1ull << (indexToPnid_[index]%MAX_NODE_BITMASK));
 
                 // Pretend node is still up until down node processing
                 // completes.
-                nodestate[index].node_state = State_Unknown;
-                nodestate[index].change_nid  = -1;
-                nodestate[index].seq_num     = 0;
+                nodestate[indexToPnid_[index]].node_state = State_Unknown;
+                nodestate[indexToPnid_[index]].change_nid  = -1;
+                nodestate[indexToPnid_[index]].seq_num     = 0;
                 for ( int i =0; i < MAX_NODE_MASKS ; i++ )
                 {
-                    nodestate[index].nodeMask.upNodes[i] = 0;
+                    nodestate[indexToPnid_[index]].nodeMask.upNodes[i] = 0;
                 }
 #ifdef NAMESERVER_PROCESS
-                nodestate[index].monConnCount = -1;
+                nodestate[indexToPnid_[index]].monConnCount = -1;
 #else
-                nodestate[index].monProcCount = 0;
+                nodestate[indexToPnid_[index]].monProcCount = 0;
 #endif
 
                 if ( validateNodeDown_ )
@@ -7422,16 +7636,16 @@
                         trace_printf( "%s@%d Divergence, queueing "
                                       "monExited{%d, %d, %lld}\n"
                                     , method_name, __LINE__
-                                    , index, MyPNID, seqNum_ );
+                                    , indexToPnid_[index], MyPNID, seqNum_ );
                     }
                     // Save info for the exited monitor so can confirm
                     // that all monitors have the same view.
-                    monExited_t monExited = {index, MyPNID, seqNum_};
+                    monExited_t monExited = {indexToPnid_[index], MyPNID, seqNum_};
                     exitedMons_.push_back( monExited );
                 }
                 else
                 {
-                    HandleDownNode(index);
+                    HandleDownNode(indexToPnid_[index]);
                 }
             }
         }
@@ -7453,12 +7667,21 @@
 #endif
 
     // Update our node states
-    for (int index = 0; index < GetConfigPNodesMax(); index++)
+    for (int index = 0; index < GetConfigPNodesCount(); index++)
     {
-        node_state = (STATE)nodestate[index].node_state;
-        change_nid = nodestate[index].change_nid;
-
-        if ( index == MyPNID &&
+        node_state = (STATE)nodestate[indexToPnid_[index]].node_state;
+        change_nid = nodestate[indexToPnid_[index]].change_nid;
+#if 0
+        // Temporary trace - debugging only
+        if (trace_settings & (TRACE_SYNC | TRACE_RECOVERY | TRACE_INIT))
+        {
+            trace_printf("%s@%d indexToPnid_[%d]=%d, MyPNID=%d,"
+                         "mystate=%d(%s), myseqNum_=%lld\n", method_name, __LINE__,
+                         index, indexToPnid_[index], MyPNID, MyNode->GetState(), 
+                         StateString(MyNode->GetState()), seqNum_ );
+        }
+#endif
+        if ( indexToPnid_[index] == MyPNID && 
              MyNode->GetState() == State_Merged && seqNum_ == 1)
         {   // Initial "allgather" for this re-integrated monitor.
 
@@ -7468,7 +7691,7 @@
             {
                 trace_printf("%s@%d Completed initial allgather for pnid=%d, "
                              "state=%d(%s), seqNum_=%lld\n", method_name, __LINE__,
-                             index, MyNode->GetState(),
+                             indexToPnid_[index], MyNode->GetState(), 
                              StateString(MyNode->GetState()), seqNum_ );
             }
 
@@ -7553,7 +7776,7 @@
                             break;
                         default:
                             // Programmer bonehead!
-                            MPI_Abort(MPI_COMM_SELF,99);
+                            abort();
                     }
                     pnode->SetState( State_Merged );
                     ReqQueue.enqueueUpReq( change_nid,
@@ -7592,14 +7815,14 @@
         case State_Stopped:
         case State_Shutdown:
             if (trace_settings & TRACE_SYNC_DETAIL)
-                trace_printf("%s@%d - Node %d is stopping.\n", method_name, __LINE__, index);
-            Node[index]->SetState( (STATE) node_state );
+                trace_printf("%s@%d - Node %d is stopping.\n", method_name, __LINE__, indexToPnid_[index]);
+            Node[indexToPnid_[index]]->SetState( (STATE) node_state );
             doShutdown = true;
             break;
         default:
             if (trace_settings & TRACE_SYNC)
                 trace_printf("%s@%d - Node %d in unknown state (%d).\n",
-                             method_name, __LINE__, index, node_state);
+                             method_name, __LINE__, indexToPnid_[index], node_state);
         }
     }
 
@@ -7634,53 +7857,50 @@
 }
 
 bool CCluster::ProcessClusterData( struct sync_buffer_def * syncBuf,
-                                   struct sync_buffer_def * sendBuf,
-                                   bool deferredTmSync )
+                                   struct sync_buffer_def * sendBuf )
 {
     const char method_name[] = "CCluster::ProcessClusterData";
     TRACE_ENTRY;
 
     // Using the data returned from Allgather, process replication data
-    // from all nodes.  If there are any TmSync messages from other
-    // nodes, defer processing until all other replicated data are
-    // processed.
+    // from all nodes.
     struct internal_msg_def *msg;
     struct sync_buffer_def *msgBuf;
-    bool haveDeferredTmSync = false;
+    bool rs = false; // never assigned below, so this function now always returns false
 
-    for (int i = 0; i < GetConfigPNodesMax(); i++)
+    for (int i = 0; i < GetConfigPNodesCount(); i++)
     {
         bool noComm;
         switch( CommType )
         {
             case CommType_InfiniBand:
-                noComm = (comms_[i] == MPI_COMM_NULL) ? true : false;
+                noComm = (comms_[indexToPnid_[i]] == MPI_COMM_NULL) ? true : false;
                 break;
             case CommType_Sockets:
-                noComm = (socks_[i] == -1) ? true : false;
+                noComm = (socks_[indexToPnid_[i]] == -1) ? true : false;
                 break;
             default:
                 // Programmer bonehead!
                 abort();
         }
         // Only process active nodes
-        if (noComm && i != MyPNID) continue;
+        if (noComm && indexToPnid_[i] != MyPNID) continue;
 
-        if ( i == MyPNID )
+        if ( indexToPnid_[i] == MyPNID )
         {   // Get pointer to message sent by this node
             msgBuf = sendBuf;
         }
         else
-        {   // Compute pointer to receive buffer element for node "i"
+        {   // Compute pointer to receive buffer element for node "indexToPnid_[i]"
             msgBuf = (struct sync_buffer_def *)
-                (((char *) syncBuf) + i * CommBufSize);
+                (((char *) syncBuf) + indexToPnid_[i] * CommBufSize);
         }
 
         if (trace_settings & TRACE_SYNC)
         {
             trace_printf("%s@%d - Buffer for node %d, swpRecCount_=%d, seq_num=%lld, "
                          "lastSeqNum_=%lld, msg_count=%d, msg_offset=%d\n",
-                         method_name, __LINE__, i, swpRecCount_,
+                         method_name, __LINE__, indexToPnid_[i], swpRecCount_,
                          msgBuf->nodeInfo.seq_num,
                          lastSeqNum_,
                          msgBuf->msgInfo.msg_count,
@@ -7688,13 +7908,26 @@
         }
 
         // if we have already processed buffer, skip it
-        if (lastSeqNum_ >= msgBuf->nodeInfo.seq_num) continue;
+        if (lastSeqNum_ >= msgBuf->nodeInfo.seq_num)
+        {
+            if (trace_settings & TRACE_SYNC)
+            {
+                trace_printf("%s@%d - Already processed buffer for node %d, swpRecCount_=%d, seq_num=%lld, "
+                             "lastSeqNum_=%lld, msg_count=%d, msg_offset=%d\n",
+                             method_name, __LINE__, indexToPnid_[i], swpRecCount_,
+                             msgBuf->nodeInfo.seq_num,
+                             lastSeqNum_,
+                             msgBuf->msgInfo.msg_count,
+                             msgBuf->msgInfo.msg_offset);
+            }
+            continue;
+        }
 
         if (trace_settings & TRACE_SYNC)
         {
             trace_printf("%s@%d - Processing buffer for node %d, swpRecCount_=%d, seq_num=%lld, "
                          "lastSeqNum_=%lld, msg_count=%d, msg_offset=%d\n",
-                         method_name, __LINE__, i, swpRecCount_,
+                         method_name, __LINE__, indexToPnid_[i], swpRecCount_,
                          msgBuf->nodeInfo.seq_num,
                          lastSeqNum_,
                          msgBuf->msgInfo.msg_count,
@@ -7704,63 +7937,41 @@
         // reset msg length to zero to initialize for PopMsg()
         msgBuf->msgInfo.msg_offset = 0;
 
-#ifndef NAMESERVER_PROCESS
-        if ( msgBuf->msgInfo.msg_count == 1
-        && (( internal_msg_def *)msgBuf->msg)->type == InternalType_Sync )
+        if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
         {
-            if ( deferredTmSync )
-            {   // This node has sent a TmSync message.  Process it now.
-                if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-                    trace_printf("%s@%d - Handling deferred TmSync messages for "
-                                 "node %d\n", method_name, __LINE__, i);
-
-                struct internal_msg_def *msg;
-                msg = Nodes->PopMsg( msgBuf );
-
-                if ( i == MyPNID )
-                    HandleMyNodeMsg (msg, MyPNID);
-                else
-                    HandleOtherNodeMsg (msg, i);
+            if (msgBuf->msgInfo.msg_count)
+            {
+                trace_printf( "%s@%d - Handling %d message(s) for node %d (seq_num=%lld)\n"
+                             , method_name, __LINE__
+                             , msgBuf->msgInfo.msg_count
+                             , indexToPnid_[i]
+                             , msgBuf->nodeInfo.seq_num );
             }
             else
             {
-                // This node has sent a TmSync message.  Defer processing
-                // until we handle all of the non-TmSync messages from
-                // other nodes.
-                haveDeferredTmSync = true;
-
-                if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-                    trace_printf("%s@%d - Deferring TmSync processing for node"
-                                 " %d until replicated data is handled\n",
-                                 method_name, __LINE__, i);
+                trace_printf( "%s@%d - No messages for node %d (seq_num=%lld)\n"
+                             , method_name, __LINE__
+                             , indexToPnid_[i]
+                             , msgBuf->nodeInfo.seq_num );
             }
         }
-        else if ( !deferredTmSync )
-#else
-        if ( !deferredTmSync )
-#endif
+        do
         {
-            if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-                trace_printf("%s@%d - Handling messages for "
-                             "node %d\n", method_name, __LINE__, i);
-            do
-            {
-                // Get the next sync msg for the node
-                msg = Nodes->PopMsg( msgBuf );
-                if (msg->type == InternalType_Null) break;
+            // Get the next sync msg for the node
+            msg = Nodes->PopMsg( msgBuf );
+            if (msg->type == InternalType_Null) break;
 
-                if ( i == MyPNID )
-                    HandleMyNodeMsg (msg, MyPNID);
-                else
-                    HandleOtherNodeMsg (msg, i);
-            }
-            while ( true );
+            if ( indexToPnid_[i] == MyPNID )
+                HandleMyNodeMsg (msg, MyPNID);
+            else
+                HandleOtherNodeMsg (msg, indexToPnid_[i]);
         }
+        while ( true );
     }
 
     TRACE_EXIT;
 
-    return haveDeferredTmSync;
+    return( rs );
 }
 
 bool CCluster::checkIfDone (  )
@@ -7812,7 +8023,7 @@
 #endif
 
 #ifdef NAMESERVER_PROCESS
-    if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL | TRACE_SYNC))
+    if (trace_settings & (TRACE_PROCESS_DETAIL | TRACE_SYNC))
         trace_printf("%s@%d - Node %d shutdown level=%d, state=%s.  Process "
                      "count=%d, internal state=%d, currentNodes_=%d, "
                      "local process count=%d, shutdownNameServer=%d, "
@@ -7831,7 +8042,7 @@
 #else
     if (NameServerEnabled)
     {
-        if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL | TRACE_SYNC))
+        if (trace_settings & (TRACE_PROCESS_DETAIL | TRACE_SYNC))
             trace_printf("%s@%d - Node %d shutdown level=%d, state=%s.  Cluster process "
                          "count=%d, internal state=%d, currentNodes_=%d, "
                          "local process count=%d\n",
@@ -7844,7 +8055,7 @@
     }
     else
     {
-        if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL | TRACE_SYNC))
+        if (trace_settings & (TRACE_PROCESS_DETAIL | TRACE_SYNC))
             trace_printf("%s@%d - Node %d shutdown level=%d, state=%s.  Process "
                          "count=%d, internal state=%d, currentNodes_=%d, "
                          "local process count=%d\n",
@@ -7880,7 +8091,7 @@
 #else
             if ( NameServerEnabled )
             {
-                
+
                 if ( clusterProcCount_ == 0 )  // all Name Servers exited
                 {
                     if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL | TRACE_SYNC))
@@ -7967,6 +8178,10 @@
                     HealthCheck.setState(MON_STOP_WATCHDOG);
                     // let the watchdog process exit
                     HealthCheck.setState(MON_EXIT_PRIMITIVES);
+                    if ((ZClientEnabled) && (ZClient != NULL)) // same NULL guard as the State_Stopped path
+                    {
+                        ZClient->StateSet( CZClient::ZC_SHUTDOWN ); // Disable Zookeeper client
+                    }
                 }
             }
 #endif
@@ -7982,6 +8197,10 @@
 
         MyNode->SetState( State_Stopped );
         MyNode->SetInternalState(State_Ready_To_Exit);
+        if ((ZClientEnabled) && (ZClient != NULL))
+        {
+            ZClient->StateSet( CZClient::ZC_SHUTDOWN ); // Disable Zookeeper client
+        }
         // we need to sync one more time so other nodes see our state
         return false;
     }
@@ -8149,6 +8368,9 @@
     // Initialize sync buffer header including node state
     msg = Nodes->InitSyncBuffer( send_buffer, seqNum_, upNodes_ );
 
+    // Initialize recv buffer
+    Nodes->InitRecvBuffer( recv_buffer );
+
     // Fill sync buffer based on queue of replication requests
     Replicator.FillSyncBuffer ( msg );
 
@@ -8160,7 +8382,7 @@
                       "seqNum_=%lld, lastSeqNum_=%lld, lowSeqNum_=%lld, "
                       "highSeqNum_=%lld, reconnectSeqNum_=%lld\n"
                     , method_name, __LINE__
-                    , Nodes->GetSyncSize()
+                    , Nodes->GetSyncSize(send_buffer)
                     , swpRecCount_
                     , send_buffer->msgInfo.msg_count
                     , send_buffer->nodeInfo.seq_num
@@ -8175,7 +8397,7 @@
 
 
     // Exchange info with other nodes
-    err = Allgather(Nodes->GetSyncSize(), send_buffer, (char *)recv_buffer,
+    err = Allgather(Nodes->GetSyncSize(send_buffer), send_buffer, (char *)recv_buffer,
              0 /*seqNum_*/, status );
 
     struct timespec ts_ag_end;
@@ -8208,21 +8430,21 @@
             if ( trace_settings & TRACE_SYNC )
             {
                 trace_printf("%s@%d - slow Allgather info: sync size=%d, message count=%d, MyPNID=%d\n",
-                             method_name, __LINE__,  Nodes->GetSyncSize(),
+                             method_name, __LINE__,  Nodes->GetSyncSize(send_buffer),
                              send_buffer->msgInfo.msg_count, MyPNID);
                 struct sync_buffer_def *msgBuf;
                 int nr;
 
-                for (int i = 0; i < GetConfigPNodesMax(); i++)
+                for (int i = 0; i < GetConfigPNodesCount(); i++)
                 {
                     bool noComm;
                     switch( CommType )
                     {
                         case CommType_InfiniBand:
-                            noComm = (comms_[i] == MPI_COMM_NULL) ? true : false;
+                            noComm = (comms_[indexToPnid_[i]] == MPI_COMM_NULL) ? true : false;
                             break;
                         case CommType_Sockets:
-                            noComm = (socks_[i] == -1) ? true : false;
+                            noComm = (socks_[indexToPnid_[i]] == -1) ? true : false;
                             break;
                         default:
                             // Programmer bonehead!
@@ -8232,12 +8454,12 @@
                     if (noComm) continue;
 
                     msgBuf = (struct sync_buffer_def *)
-                        (((char *) recv_buffer) + i * CommBufSize);
+                        (((char *) recv_buffer) + indexToPnid_[i] * CommBufSize);
 
-                    MPI_Get_count(&status[i], MPI_CHAR, &nr);
+                    MPI_Get_count(&status[indexToPnid_[i]], MPI_CHAR, &nr);
 
                     trace_printf("%s@%d - slow Allgather info, pnid=%d: received bytes=%d, message count=%d, msg_offset=%d\n",
-                                 method_name, __LINE__, i, nr,
+                                 method_name, __LINE__, indexToPnid_[i], nr,
                                  msgBuf->msgInfo.msg_count,
                                  msgBuf->msgInfo.msg_offset);
                 }
@@ -8318,18 +8540,21 @@
         }
     }
 
-    if ( ProcessClusterData( recv_buffer, send_buffer, false ) )
-    {   // There is a TmSync message remaining to be handled
-        ProcessClusterData( recv_buffer, send_buffer, true );
-    }
-
     if (swpRecCount_ == 1)
     {
         // Save the sync buffer and corresponding sequence number we just processed
         // On reconnect we must resend the last buffer and the current buffer
         // to ensure dropped buffers are processed by all monitor processe in the
-        // correct order
+        // correct order.
+        // Note: ProcessClusterData() modifies the contents of the send buffer
+        //       so we must save the buffer prior to processing it
         Nodes->SaveMyLastSyncBuffer();
+    }
+
+    ProcessClusterData( recv_buffer, send_buffer );
+
+    if (swpRecCount_ == 1)
+    {
         lastSeqNum_ = seqNum_;
 
         // Increment count of "Allgather" calls.  If wrap-around, start again at 1.
@@ -8357,264 +8582,6 @@
     return result;
 }
 
-#ifndef NAMESERVER_PROCESS
-void CCluster::exchangeTmSyncData ( struct sync_def *sync, bool bumpSync )
-{
-    const char method_name[] = "CCluster::exchangeTmSyncData";
-    TRACE_ENTRY;
-
-    ++swpRecCount_; // recursive count for this function
-
-    bool doShutdown = false;
-    bool lastAllgatherWithLastSyncBuffer = false;
-
-    struct internal_msg_def *msg;
-    MPI_Status status[GetConfigPNodesMax()];
-    int err;
-    struct sync_buffer_def *recv_buffer;
-    struct sync_buffer_def *send_buffer = Nodes->GetSyncBuffer();
-    unsigned long long savedSeqNum = 0;
-
-    // if we are here in a second recursive call that occurred while
-    // processing TMSync data, use the second receive buffer
-    // else, use the first one.
-    if (swpRecCount_ == 1)
-    {
-      recv_buffer = recvBuffer_;
-    }
-    else
-    {
-      // should not be here in more than one recursive call.
-      assert(swpRecCount_ == 2);
-      recv_buffer = recvBuffer2_;
-    }
-
-    if (bumpSync)
-    {
-        // Save the sync buffer and corresponding sequence number we just processed
-        // On reconnect we must resend the last buffer and the current buffer
-        // to ensure dropped buffers are processed by all monitor processe in the
-        // correct order
-        Nodes->SaveMyLastSyncBuffer();
-        lastSeqNum_ = seqNum_;
-
-        // Increment count of "Allgather" calls.  If wrap-around, start again at 1.
-        if ( ++seqNum_ == 0) seqNum_ = 1;
-
-        if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-            trace_printf( "%s@%d - Bumping sequence number, "
-                          "swpRecCount_=%d, seqNum_=%lld, lastSeqNum_=%lld\n"
-                        , method_name, __LINE__
-                        , swpRecCount_
-                        , seqNum_
-                        , lastSeqNum_);
-
-    }
-
-    // Initialize sync buffer header including node state
-    msg = Nodes->InitSyncBuffer( send_buffer, seqNum_, upNodes_ );
-
-    // Add tmsync data
-    AddTmsyncMsg( send_buffer, sync, msg );
-
-reconnected:
-
-    if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-        trace_printf( "%s@%d - doing Allgather size=%d, swpRecCount_=%d, "
-                      "message count=%d, message seq_num=%lld, "
-                      "seqNum_=%lld, lastSeqNum_=%lld, lowSeqNum_=%lld, "
-                      "highSeqNum_=%lld, reconnectSeqNum_=%lld\n"
-                    , method_name, __LINE__
-                    , Nodes->GetSyncSize()
-                    , swpRecCount_
-                    , send_buffer->msgInfo.msg_count
-                    , send_buffer->nodeInfo.seq_num
-                    , seqNum_
-                    , lastSeqNum_
-                    , lowSeqNum_
-                    , highSeqNum_
-                    , reconnectSeqNum_);
-
-    struct timespec ts_ag_begin;
-    clock_gettime(CLOCK_REALTIME, &ts_ag_begin);
-
-
-    // Exchange info with other nodes
-    err = Allgather(Nodes->GetSyncSize(), send_buffer, (char *)recv_buffer,
-             0 /*seqNum_*/, status );
-
-    struct timespec ts_ag_end;
-    clock_gettime(CLOCK_REALTIME, &ts_ag_end);
-
-    if (err != MPI_SUCCESS && err != MPI_ERR_IN_STATUS)
-    {
-        if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-        {
-            trace_printf("%s@%d - unexpected Allgather error=%s (%d)\n",
-                         method_name, __LINE__, ErrorMsg(err), err);
-        }
-
-        char buf[MON_STRING_BUF_SIZE];
-        snprintf(buf, sizeof(buf), "[%s], Unexpected MPI communications "
-                 "error=%s (%d).\n", method_name, ErrorMsg(err), err);
-        mon_log_write(MON_CLUSTER_EXCHANGETMSYNC_1, SQ_LOG_ERR, buf);
-
-        // Allgather() failed in a fundamental way, bring this node down
-        if ( !enqueuedDown_ )
-        {
-            enqueuedDown_ = true;
-            ReqQueue.enqueueDownReq(MyPNID);
-        }
-    }
-    else
-    {
-        if (agTimeStats( ts_ag_begin, ts_ag_end))
-        {  // Slow cycle, print info
-            if ( trace_settings & TRACE_SYNC )
-            {
-                trace_printf("%s@%d - slow Allgather info: sync size=%d, message count=%d, MyPNID=%d\n",
-                             method_name, __LINE__,  Nodes->GetSyncSize(),
-                             send_buffer->msgInfo.msg_count, MyPNID);
-                struct sync_buffer_def *msgBuf;
-                int nr;
-
-                for (int i = 0; i < GetConfigPNodesMax(); i++)
-                {
-                    bool noComm;
-                    switch( CommType )
-                    {
-                        case CommType_InfiniBand:
-                            noComm = (comms_[i] == MPI_COMM_NULL) ? true : false;
-                            break;
-                        case CommType_Sockets:
-                            noComm = (socks_[i] == -1) ? true : false;
-                            break;
-                        default:
-                            // Programmer bonehead!
-                            abort();
-                    }
-                    // Only process active nodes
-                    if (noComm) continue;
-
-                    msgBuf = (struct sync_buffer_def *)
-                        (((char *) recv_buffer) + i * CommBufSize);
-
-                    MPI_Get_count(&status[i], MPI_CHAR, &nr);
-
-                    trace_printf("%s@%d - slow Allgather info, pnid=%d: received bytes=%d, message count=%d, msg_offset=%d\n",
-                                 method_name, __LINE__, i, nr,
-                                 msgBuf->msgInfo.msg_count,
-                                 msgBuf->msgInfo.msg_offset);
-                }
-            }
-        }
-
-        UpdateClusterState( doShutdown
-                          , recv_buffer
-                          , status
-                          , send_buffer->nodeInfo.change_nid);
-
-        if ( lastAllgatherWithLastSyncBuffer )
-        {
-            seqNum_ = savedSeqNum;
-            lastAllgatherWithLastSyncBuffer = false;
-            send_buffer = Nodes->GetSyncBuffer();
-
-            if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-                trace_printf( "%s@%d - Resetting lastAllgatherWithLastSyncBuffer=%d\n"
-                            , method_name, __LINE__
-                            , lastAllgatherWithLastSyncBuffer);
-
-            goto reconnected;
-        }
-
-        if ( reconnectSeqNum_ != 0 )
-        {
-            if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-                trace_printf( "%s@%d - Allgather IO retry, swpRecCount_=%d, "
-                              "seqNum_=%lld, lastSeqNum_=%lld, lowSeqNum_=%lld, "
-                              "highSeqNum_=%lld, reconnectSeqNum_=%lld\n"
-                            , method_name, __LINE__
-                            , swpRecCount_
-                            , seqNum_
-                            , lastSeqNum_
-                            , lowSeqNum_
-                            , highSeqNum_
-                            , reconnectSeqNum_);
-
-            // The Allgather() has executed a reconnect at reconnectSeqNum_.
-            // The UpdateClusterState has set the lowSeqNum_and highSeqNum_
-            // in the current IO exchange which will indicate whether there is
-            // a mismatch in IOs between monitor processes. If there is a mismatch,
-            // the lowSeqNum_and highSeqNum_ relative to our current seqNum_
-            // will determine how to redrive the exchange of node data.
-            if (seqNum_ > lowSeqNum_)
-            { // A remote monitor did not receive our last SyncBuffer
-                // Redo exchange with the previous SyncBuffer
-                send_buffer = Nodes->GetLastSyncBuffer();
-                savedSeqNum = seqNum_;
-                seqNum_ = lastSeqNum_;
-                // Indicate to follow up the next exchange with current SyncBuffer
-                lastAllgatherWithLastSyncBuffer = true;
-                lowSeqNum_ = highSeqNum_ = reconnectSeqNum_ = 0;
-
-                if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-                    trace_printf( "%s@%d - Setting lastAllgatherWithLastSyncBuffer=%d\n"
-                                , method_name, __LINE__
-                                , lastAllgatherWithLastSyncBuffer);
-
-                goto reconnected;
-            }
-            else if (seqNum_ < highSeqNum_)
-            { // The local monitor did not receive the last remote SyncBuffer
-                // Redo exchange with the current SyncBuffer
-                send_buffer = Nodes->GetSyncBuffer();
-                lowSeqNum_ = highSeqNum_ = reconnectSeqNum_ = 0;
-
-                if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-                    trace_printf( "%s@%d - lastAllgatherWithLastSyncBuffer=%d\n"
-                                , method_name, __LINE__
-                                , lastAllgatherWithLastSyncBuffer);
-
-                goto reconnected;
-            }
-            lowSeqNum_ = highSeqNum_ = reconnectSeqNum_ = 0;
-        }
-    }
-
-    if ( ProcessClusterData( recv_buffer, send_buffer, false ) )
-    {   // There is a TmSync message remaining to be handled
-        ProcessClusterData( recv_buffer, send_buffer, true );
-    }
-
-    if (swpRecCount_ == 1)
-    {
-        // Save the sync buffer and corresponding sequence number we just processed
-        // On reconnect we must resend the last buffer and the current buffer
-        // to ensure dropped buffers are processed by all monitor processe in the
-        // correct order
-        Nodes->SaveMyLastSyncBuffer();
-        lastSeqNum_ = seqNum_;
-
-        // Increment count of "Allgather" calls.  If wrap-around, start again at 1.
-        if ( ++seqNum_ == 0) seqNum_ = 1;
-    }
-
-    if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-        trace_printf( "%s@%d - node data exchange completed, swpRecCount_=%d, "
-                      "seqNum_=%lld, lastSeqNum_=%lld, reconnectSeqNum_=%lld\n"
-                    , method_name, __LINE__
-                    , swpRecCount_
-                    , seqNum_
-                    , lastSeqNum_
-                    , reconnectSeqNum_);
-
-    --swpRecCount_;
-
-    TRACE_EXIT;
-}
-#endif
-
 void CCluster::EpollCtl( int efd, int op, int fd, struct epoll_event *event )
 {
     const char method_name[] = "CCluster::EpollCtl";
@@ -8623,15 +8590,15 @@
     if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
     {
         int iPeer;
-        for ( iPeer = 0; iPeer < GetConfigPNodesMax(); iPeer++ )
+        for ( iPeer = 0; iPeer < GetConfigPNodesCount(); iPeer++ )
         { // Find corresponding peer by matching socket fd
-            if ( fd == socks_[iPeer] ) break;
+            if ( fd == socks_[indexToPnid_[iPeer]] ) break;
         }
         trace_printf( "%s@%d epoll_ctl( efd=%d,%s, fd=%d(%s), %s )\n"
                     , method_name, __LINE__
                     , efd
                     , EpollOpString(op)
-                    , fd, Node[iPeer]->GetName()
+                    , fd, (iPeer < GetConfigPNodesCount()) ? Node[indexToPnid_[iPeer]]->GetName() : "unknown" // no peer owns fd when the search loop ran to completion
                     , EpollEventString(event->events) );
     }
 #endif
@@ -8641,19 +8608,20 @@
         char ebuff[256];
         char buf[MON_STRING_BUF_SIZE];
         int iPeer;
-        for ( iPeer = 0; iPeer < GetConfigPNodesMax(); iPeer++ )
+        for ( iPeer = 0; iPeer < GetConfigPNodesCount(); iPeer++ )
         { // Find corresponding peer by matching socket fd
-            if ( fd == socks_[iPeer] ) break;
+            if ( fd == socks_[indexToPnid_[iPeer]] ) break;
         }
         snprintf( buf, sizeof(buf), "[%s@%d] epoll_ctl(efd=%d,%s, fd=%d(%s), %s) error: %s\n"
                 , method_name, __LINE__
                 , efd
                 , EpollOpString(op)
-                , fd, Node[iPeer]->GetName()
+                , fd, (iPeer < GetConfigPNodesCount()) ? Node[indexToPnid_[iPeer]]->GetName() : "unknown" // no peer owns fd when the search loop ran to completion
                 , EpollEventString(event->events)
                 , strerror_r( errno, ebuff, 256 ) );
         mon_log_write( MON_CLUSTER_EPOLLCTL_1, SQ_LOG_CRIT, buf );
-        MPI_Abort( MPI_COMM_SELF,99 );
+
+        mon_failure_exit();
     }
 
     TRACE_EXIT;
@@ -8668,15 +8636,15 @@
     if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
     {
         int iPeer;
-        for ( iPeer = 0; iPeer < GetConfigPNodesMax(); iPeer++ )
+        for ( iPeer = 0; iPeer < GetConfigPNodesCount(); iPeer++ )
         { // Find corresponding peer by matching socket fd
-            if ( fd == socks_[iPeer] ) break;
+            if ( fd == socks_[indexToPnid_[iPeer]] ) break;
         }
         trace_printf( "%s@%d epoll_ctl( efd=%d,%s, fd=%d(%s), %s )\n"
                     , method_name, __LINE__
                     , efd
                     , EpollOpString(EPOLL_CTL_DEL)
-                    , fd, Node[iPeer]->GetName()
+                    , fd, (iPeer < GetConfigPNodesCount()) ? Node[indexToPnid_[iPeer]]->GetName() : "unknown" // no peer owns fd when the search loop ran to completion
                     , EpollEventString(event->events) );
     }
 
@@ -8697,7 +8665,8 @@
                     , EpollEventString(event->events)
                     , strerror_r( err, ebuff, 256 ) );
             mon_log_write( MON_CLUSTER_EPOLLCTLDELETE_1, SQ_LOG_CRIT, buf );
-            MPI_Abort( MPI_COMM_SELF,99 );
+
+            mon_failure_exit();
         }
     }
 
@@ -8730,7 +8699,8 @@
         snprintf( buf, sizeof(buf), "[%s@%d] MPI_Allgather error=%s\n",
             method_name, __LINE__, ErrorMsg( rc ) );
         mon_log_write( MON_CLUSTER_INITCLUSTERSOCKS_3, SQ_LOG_CRIT, buf );
-        MPI_Abort( MPI_COMM_SELF,99 );
+
+        mon_failure_exit();
     }
 #ifdef NAMESERVER_PROCESS
     if ( !IsRealCluster )
@@ -8761,7 +8731,8 @@
         snprintf( buf, sizeof(buf), "[%s@%d] gethostbyname(%s) error: %s\n",
             method_name, __LINE__, n, strerror_r( h_errno, ebuff, 256 ) );
         mon_log_write( MON_CLUSTER_INITCLUSTERSOCKS_4, SQ_LOG_CRIT, buf );
-        MPI_Abort( MPI_COMM_SELF,99 );
+
+        mon_failure_exit();
     }
     // Initialize my source address structure
     memcpy( srcaddr, he->h_addr, 4 );
@@ -8789,7 +8760,8 @@
                             method_name, __LINE__, n,
                             strerror_r( h_errno, ebuff, 256 ) );
                         mon_log_write( MON_CLUSTER_INITCLUSTERSOCKS_5, SQ_LOG_CRIT, buf );
-                        MPI_Abort( MPI_COMM_SELF,99 );
+
+                        mon_failure_exit();
                     }
                     // Initialize peer's destination address structure
                     memcpy( dstaddr, he->h_addr, 4 );
@@ -8866,7 +8838,8 @@
                         method_name, __LINE__, syncSock_ );
                 }
                 mon_log_write( MON_CLUSTER_INITCLUSTERSOCKS_6, SQ_LOG_CRIT, buf );
-                MPI_Abort( MPI_COMM_SELF,99 );
+
+                mon_failure_exit();
             }
             if ( idst >= 0 && fcntl( socks_[rankToPnid[idst]], F_SETFL, O_NONBLOCK ) )
             {
@@ -8875,7 +8848,8 @@
                 snprintf( buf, sizeof(buf), "[%s@%d] fcntl(NONBLOCK) error: %s\n",
                     method_name, __LINE__, strerror_r( errno, ebuff, 256 ) );
                 mon_log_write( MON_CLUSTER_INITCLUSTERSOCKS_7, SQ_LOG_CRIT, buf );
-                MPI_Abort( MPI_COMM_SELF,99 );
+
+                mon_failure_exit();
             }
             MPI_Barrier( MPI_COMM_WORLD );
         }
@@ -8909,7 +8883,8 @@
                 , method_name, __LINE__
                 , Node_name, strerror_r( h_errno, ebuff, 256 ) );
         mon_log_write( MON_CLUSTER_INITSERVERSOCK_1, SQ_LOG_CRIT, buf );
-        abort();
+
+        mon_failure_exit();
     }
     memcpy( addr, he->h_addr, 4 );
 
@@ -8952,7 +8927,8 @@
                 , method_name, __LINE__, serverCommPort
                 , strerror_r( errno, ebuff, 256 ) );
         mon_log_write( MON_CLUSTER_INITSERVERSOCK_2, SQ_LOG_CRIT, buf );
-        abort();
+
+        mon_failure_exit();
     }
     else
     {
@@ -9014,7 +8990,8 @@
                 , method_name, __LINE__, serverSyncPort
                 , strerror_r( errno, ebuff, 256 ) );
         mon_log_write( MON_CLUSTER_INITSERVERSOCK_3, SQ_LOG_CRIT, buf );
-        abort();
+
+        mon_failure_exit();
     }
     else
     {
@@ -9061,7 +9038,8 @@
                 , method_name, __LINE__, mon2nsPort
                 , strerror_r( errno, ebuff, 256 ) );
         mon_log_write( MON_CLUSTER_INITSERVERSOCK_4, SQ_LOG_CRIT, buf );
-        abort();
+
+        mon_failure_exit();
     }
     else
     {
@@ -9103,7 +9081,8 @@
                    , "[%s@%d] MON2MON_COMM_PORT environment variable is not set!\n"
                    , method_name, __LINE__ );
            mon_log_write( MON_CLUSTER_INITSERVERSOCK_5, SQ_LOG_CRIT, buf );
-           abort();
+
+           mon_failure_exit();
         }
     
         // For virtual env, add PNid to the port so we can still test without collisions of port numbers
@@ -9122,7 +9101,8 @@
                     , method_name, __LINE__, ptpPort
                     , strerror_r( errno, ebuff, MON_STRING_BUF_SIZE ) );
             mon_log_write( MON_CLUSTER_INITSERVERSOCK_6, SQ_LOG_CRIT, buf );
-            abort();
+
+            mon_failure_exit();
         }
         else
         {
@@ -9152,10 +9132,23 @@
     {
         char ebuff[256];
         char buf[MON_STRING_BUF_SIZE];
-        snprintf( buf, sizeof(buf), "[%s@%d] epoll_create1() error: %s\n",
+        snprintf( buf, sizeof(buf), "[%s@%d] epoll_create1(sync) error: %s\n",
             method_name, __LINE__, strerror_r( errno, ebuff, 256 ) );
         mon_log_write( MON_CLUSTER_INITSERVERSOCK_7, SQ_LOG_CRIT, buf );
-        MPI_Abort( MPI_COMM_SELF,99 );
+
+        mon_failure_exit();
+    }
+    // Create a second epoll instance, kept in epollPingFD_; a failure is logged and then mon_failure_exit() is called.
+    epollPingFD_ = epoll_create1( EPOLL_CLOEXEC );
+    if ( epollPingFD_ < 0 )
+    {
+        char ebuff[256];
+        char buf[MON_STRING_BUF_SIZE];
+        snprintf( buf, sizeof(buf), "[%s@%d] epoll_create1(ping) error: %s\n",
+            method_name, __LINE__, strerror_r( errno, ebuff, 256 ) );
+        mon_log_write( MON_CLUSTER_INITSERVERSOCK_5, SQ_LOG_CRIT, buf ); // NOTE(review): INITSERVERSOCK_5 is also logged when MON2MON_COMM_PORT is unset, so this event id is shared by two failures -- confirm whether a dedicated id is wanted
+
+        mon_failure_exit();
     }
 
     TRACE_EXIT;
@@ -9343,7 +9336,8 @@
             sprintf( la_buf, "[%s], socket() failed! errno=%d (%s)\n"
                    , method_name, err, strerror( err ));
             mon_log_write(MON_CLUSTER_CONNECT_1, SQ_LOG_CRIT, la_buf);
-            abort();
+
+            mon_failure_exit();
         }
 
         he = gethostbyname( host );
@@ -9354,7 +9348,8 @@
             snprintf( buf, sizeof(buf), "[%s@%d] gethostbyname(%s) error: %s\n",
                 method_name, __LINE__, host, strerror_r( h_errno, ebuff, 256 ) );
             mon_log_write( MON_CLUSTER_CONNECT_2, SQ_LOG_CRIT, buf );
-            abort();
+
+            mon_failure_exit();
         }
 
         // Connect socket.
@@ -9566,7 +9561,8 @@
         sprintf( la_buf, "[%s], socket() failed! errno=%d (%s)\n"
                , method_name, err, strerror( err ));
         mon_log_write(MON_CLUSTER_CONNECTTOSELF_1, SQ_LOG_CRIT, la_buf);
-        MPI_Abort( MPI_COMM_SELF,99 );
+
+        mon_failure_exit();
     }
 
     he = gethostbyname( "localhost" );
@@ -9577,7 +9573,8 @@
         snprintf( buf, sizeof(buf), "[%s@%d] gethostbyname(%s) error: %s\n",
             method_name, __LINE__, "localhost", strerror_r( h_errno, ebuff, 256 ) );
         mon_log_write( MON_CLUSTER_CONNECTTOSELF_2, SQ_LOG_CRIT, buf );
-        MPI_Abort( MPI_COMM_SELF,99 );
+
+        mon_failure_exit();
     }
 
     // Connect socket.
@@ -9615,7 +9612,8 @@
             sprintf( la_buf, "[%s], connect() failed! errno=%d (%s)\n"
                    , method_name, err, strerror( err ));
             mon_log_write(MON_CLUSTER_CONNECTTOSELF_3, SQ_LOG_CRIT, la_buf);
-            MPI_Abort( MPI_COMM_SELF,99 );
+
+            mon_failure_exit();
         }
     }
 
@@ -9933,7 +9931,7 @@
     {
         char la_buf[MON_STRING_BUF_SIZE];
         int err = errno;
-        sprintf( la_buf, "[%s], setsockopt() failed! errno=%d (%s)\n"
+        sprintf( la_buf, "[%s], setsockopt(SO_REUSEADDR) failed! errno=%d (%s)\n"
                , method_name, err, strerror( err ));
         mon_log_write(MON_CLUSTER_MKCLTSOCK_6, SQ_LOG_ERR, la_buf);
         close( sock );
@@ -10057,6 +10055,7 @@
     struct sockaddr_in  sockinfo;    // socket address info
 
     size = sizeof(sockinfo);
+    const size_t srcipLen = sizeof(sockinfo.sin_addr); // copy length: exactly one in_addr
 
     if ( !retries )
     {
@@ -10075,7 +10074,8 @@
         {
             memset( (char *) &sockinfo, 0, size );
             memcpy( (char *) &sockinfo.sin_addr,
-                (char *) srcip, sizeof(srcip) );
+                (unsigned char *) srcip, srcipLen );
+            // NOTE(review): srcip is assumed to hold raw address bytes rather than a C string, so strlen() must not size this copy -- confirm against callers
             sockinfo.sin_family = AF_INET;
             sockinfo.sin_port = 0;
             if ( bind( sock, (struct sockaddr *) &sockinfo, size ) )
diff --git a/core/sqf/monitor/linux/cluster.h b/core/sqf/monitor/linux/cluster.h
index 2dfbfb8..2c22fbf 100644
--- a/core/sqf/monitor/linux/cluster.h
+++ b/core/sqf/monitor/linux/cluster.h
@@ -99,6 +99,10 @@
         Reintegrate_Err15   // Could not get connect acknowledgement
     };
 
+    enum { SYNC_DELAY_LOGGING_THRESHOLD = 20 }; // percentage of SQ_MON_SYNC_TIMEOUT (20% default)
+    enum { SYNC_DELAY_LOGGING_THRESHOLD_MAX = 50 }; // maximum percentage of SQ_MON_SYNC_TIMEOUT (50%)
+    enum { SYNC_DELAY_LOGGING_FREQUENCY_DEFAULT = 60 }; // Minimum 1 minute between log messages
+
     int        NumRanks;       // Current # of processes in the cluster
 
     CCluster( void );
@@ -131,8 +135,8 @@
 #ifndef NAMESERVER_PROCESS
     void AssignTmLeader( int pnid, bool checkProcess );
 #endif
-    void AssignMonitorLeader( const char* failedMaster );
-    void UpdateMonitorPort (const char* newMaster);
+    void AssignMonitorLeader( const char * failedMaster );
+    void UpdateMonitorPort (const char * newMaster);
     void stats();
     void CompleteSyncCycle()
         { syncCycle_.lock(); syncCycle_.wait(); syncCycle_.unlock(); }
@@ -148,17 +152,12 @@
     inline void SetTmLeader( int tmLeaderNid ) { tmLeaderNid_ = tmLeaderNid; } 
 #endif
     int  GetDownedNid( void );
-#ifndef NAMESERVER_PROCESS
-    inline int GetTmSyncPNid( void ) { return( tmSyncPNid_ ); } // Physical Node ID of current TmSync operations master
-#endif
     void InitClusterComm(int worldSize, int myRank, int *rankToPnid);
+    void InitializeConfigCluster( int pnid );
     void addNewComm(int nid, int otherRank, MPI_Comm comm);
     void addNewSock(int nid, int otherRank, int sockFd );
 
     bool exchangeNodeData ( );
-#ifndef NAMESERVER_PROCESS
-    void exchangeTmSyncData ( struct sync_def *sync, bool bumpSync );
-#endif
     int GetConfigPNodesCount() { return configPNodesCount_; }
     int GetConfigPNodesMax() { return configPNodesMax_; }
     bool ImAlive( bool needed=false, struct sync_def *sync = NULL );
@@ -168,8 +167,6 @@
 #else
     void HardNodeDownNs( int nid );
 #endif
-    void SoftNodeDown( int pnid );
-    int  SoftNodeUpPrepare( int pnid );
     bool CheckSpareSet( int pnid );
     inline bool  IsIntegrating( void ) { return( (joinSock_ != -1 || joinComm_ != MPI_COMM_NULL) || integratingPNid_ != -1 ); }
     struct message_def *JoinMessage( const char *node_name, int pnid, JOINING_PHASE phase );
@@ -234,6 +231,7 @@
         long fullSize_;
         long compressedSize_;
         long tmLeader_;
+        long verifiers_[3];
         long clusterRegistryCount_;
         long processRegistryCount_;
         long uniqueStringCount_;
@@ -254,25 +252,15 @@
     int            ptpSock_;
 #endif
     int            epollFD_;
+    int            epollPingFD_;
     int           *indexToPnid_;
     int            configMaster_;
 
     CNode  **Node;           // array of nodes
     CLNode **LNode;          // array of logical nodes
-    int      tmSyncPNid_;    // Physical Node ID of current TmSync operations master
 
-
-#ifndef NAMESERVER_PROCESS
-    void AddTmsyncMsg( struct sync_buffer_def *tmSyncBuffer
-                     , struct sync_def *sync
-                     , struct internal_msg_def *msg);
-#endif
     void AddReplData (struct internal_msg_def *msg);
     void AddMyNodeState ();
-#ifndef NAMESERVER_PROCESS
-    void TraceTMSyncState(struct sync_buffer_def *recv_buffer,
-                          size_t recvCount);
-#endif
     void UpdateAllNodeState(struct sync_buffer_def *recv_buffer,
                             size_t recvCount,
                             bool &overflow);
@@ -330,7 +318,10 @@
     unsigned long long highSeqNum_;
     unsigned long long reconnectSeqNum_;
     unsigned long long seqNum_;
-    int cumulativeDelaySec_;
+    int cumulativeSyncDelay_;        // cumulative seconds that Allgather is stuck
+                                     // this is the basis of log events generated
+    int syncDelayLogEventInterval_;  // when subsequent log events are generated (seconds)
+    int syncDelayLogEventThreshold_; // when 1st log event is generated (seconds)
 
     bool waitForWatchdogExit_;    // set when watchdog exit has already been issued
     bool waitForNameServerExit_;  // set when Name Server exit has already been issued
@@ -405,9 +396,10 @@
     int Allgather(int nbytes, void *sbuf, char *rbuf, int tag, MPI_Status *stats);
     int AllgatherIB(int nbytes, void *sbuf, char *rbuf, int tag, MPI_Status *stats);
     int AllgatherSock(int nbytes, void *sbuf, char *rbuf, int tag, MPI_Status *stats);
-    int AllgatherSockReconnect( MPI_Status *stats, bool reestablishConnections = false );
-    int AcceptSockPeer( CNode *node, int peer, bool reestablishConnections = false );
-    int ConnectSockPeer( CNode *node, int peer, bool reestablishConnections = false );
+    int AllgatherSockReconnect( MPI_Status *stats, peer_t *peers, bool resetConnections = false );
+    int AcceptSockPeer( MPI_Status *stats, bool resetConnections = false );
+    int CheckSockPeer( int pnid, MPI_Status *stats, peer_t *peer );
+    int ConnectSockPeer( CNode *node, int peer, bool resetConnections = false );
 
     void ValidateClusterState( cluster_state_def_t nodestate[],
                                bool haveDivergence );
@@ -415,8 +407,8 @@
     void HandleDownNode( int nid );
     void HandleReintegrateError( int rc, int err,
                                  int nid, nodeId_t *nodeInfo,
-                                 bool abort );
-    bool PingSockPeer(CNode *node);
+                                 bool abort=false );
+    bool PingSockPeer( CNode *node, struct timespec &peerZnodeFailTime );
     void ReIntegrateMPI( int initProblem );
     void ReIntegrateSock( int initProblem );
     void SendReIntegrateStatus( STATE nodeState, int status );
@@ -426,8 +418,7 @@
                              MPI_Status *status,
                              int sentChangeNid);
     bool ProcessClusterData( struct sync_buffer_def * syncBuf,
-                             struct sync_buffer_def * sendBuf,
-                             bool deferredTmSync );
+                             struct sync_buffer_def * sendBuf );
 
     bool checkIfDone ( void );
     void setNewComm(int nid);
diff --git a/core/sqf/monitor/linux/commaccept.cxx b/core/sqf/monitor/linux/commaccept.cxx
index 13c2ebd..b925e9b 100644
--- a/core/sqf/monitor/linux/commaccept.cxx
+++ b/core/sqf/monitor/linux/commaccept.cxx
@@ -547,6 +547,7 @@
         snprintf(buf, sizeof(buf), "[%s], unable to obtain node id from new "
                  "monitor: %s.\n", method_name, ErrorMsg(rc));
         mon_log_write(MON_COMMACCEPT_8, SQ_LOG_ERR, buf);    
+        CommAccept.startAccepting();
         return;
     }
 
@@ -668,6 +669,9 @@
 
         close( joinFd );
 
+        // This reply will terminate the other monitor
+        node->SetState(State_Down);
+
         char buf[MON_STRING_BUF_SIZE];
         snprintf( buf, sizeof(buf)
                 , "[%s], got connection from node %s (pnid=%d). "
@@ -892,6 +896,13 @@
 
         if (nodeStatus.state == State_Up)
         {
+            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+            {
+                trace_printf( "%s@%d - Received reintegrate status: state=%s, error=%d\n"
+                            , method_name, __LINE__
+                            , StateString(nodeStatus.state)
+                            , nodeStatus.status );
+            }
             // communicate the change and handle it after sync
             // in ImAlive
             node->SetChangeState( true );
diff --git a/core/sqf/monitor/linux/config.cxx b/core/sqf/monitor/linux/config.cxx
index 9794199..4749bd3 100644
--- a/core/sqf/monitor/linux/config.cxx
+++ b/core/sqf/monitor/linux/config.cxx
@@ -556,6 +556,11 @@
     rc = tc_get_registry_cluster_set( &regClusterCount
                                     , regClusterMax
                                     , NULL );
+    if (trace_settings & (TRACE_INIT))
+      trace_printf("%s@%d - rc:%d, #cluster scope keys:%d\n", // "@" separates method name and line, as in the file's other traces
+                   method_name, __LINE__,
+                   rc,
+                   regClusterCount);
     if ( rc )
     {
         char la_buf[MON_STRING_BUF_SIZE];
@@ -603,9 +608,14 @@
     }
 
     // Get process scope configuration registry entries count
-    rc = tc_get_registry_cluster_set( &regProcessCount
+    rc = tc_get_registry_process_set( &regProcessCount
                                     , regProcessMax
                                     , NULL );
+    if (trace_settings & (TRACE_INIT))
+      trace_printf("%s@%d - rc:%d, #process scope keys:%d\n", // "@" separates method name and line, as in the file's other traces
+                   method_name, __LINE__,
+                   rc,
+                   regProcessCount);
     if ( rc )
     {
         char la_buf[MON_STRING_BUF_SIZE];
@@ -619,7 +629,7 @@
     regProcessConfig = new TcRegistryConfiguration_t[regProcessMax];
     if (regClusterConfig)
     {
-        rc = tc_get_registry_cluster_set( &regProcessCount
+        rc = tc_get_registry_process_set( &regProcessCount
                                         , regProcessMax
                                         , regProcessConfig );
         if ( rc )
@@ -882,47 +892,6 @@
     TRACE_EXIT;
 }
 
-bool CConfigContainer::findUniqueString( int         nid
-                                       , const char *uniqStr
-                                       , strId_t    &strId )
-{
-    const char method_name[] = "CConfigContainer::findUniqueString";
-    TRACE_ENTRY;
-
-    bool result = false;
-    int rc;
-    int id;
-
-    if (trace_settings & (TRACE_INIT | TRACE_REQUEST))
-    {
-        trace_printf( "%s@%d finding unique string nid=%d string=%s\n"
-                    , method_name, __LINE__
-                    , nid, uniqStr );
-    }
-
-    rc = tc_get_unique_string_id( nid, uniqStr, &id );
-    if ( rc )
-    {
-        if ( rc != TCDBNOEXIST )
-        {
-            char buf[MON_STRING_BUF_SIZE];
-            snprintf( buf, sizeof(buf)
-                    , "[%s] tc_get_unique_string_id() failed, ""error=%d (%s)\n"
-                    , method_name, rc, tc_errmsg( rc ) );
-            mon_log_write( MON_CONFIGCONT_FINDUNIQUESTRING_1, SQ_LOG_ERR, buf );
-        }
-    }
-    else
-    {
-        strId.nid = nid;
-        strId.id = id;
-        result = true;
-    }
-
-    TRACE_EXIT;
-    return result;
-}
-
 int CConfigContainer::getMaxUniqueId( int nid )
 {
     const char method_name[] = "CConfigContainer::getMaxUniqueId";
@@ -963,6 +932,92 @@
     return id;
 }
 
+bool CConfigContainer::getUniqueString(int nid, int id, string & uniqStr )
+{   // Copy the unique string stored under (nid, id) into uniqStr; return true only when the lookup succeeds.
+    const char method_name[] = "CConfigContainer::getUniqueString";
+    TRACE_ENTRY;
+
+    bool result = false;    // remains false unless tc_get_unique_string() returns 0
+    int rc;
+    char uniqueString[TC_UNIQUE_STRING_VALUE_MAX] = { 0 };  // zero-initialized buffer handed to the lookup
+
+    if (trace_settings & (TRACE_INIT | TRACE_REQUEST))
+    {
+        trace_printf( "%s@%d Get unique string, stringId(nid=%d, id=%d)\n"
+                    , method_name, __LINE__
+                    , nid, id );
+    }
+
+    rc = tc_get_unique_string( nid, id, uniqueString );
+    if ( rc )
+    {   // Lookup failed: the caller gets an empty string and a false return
+        uniqStr.assign( "" );
+        if ( rc != TCDBNOEXIST )    // TCDBNOEXIST is not logged (presumably it just means "no such entry" -- confirm)
+        {
+            char buf[MON_STRING_BUF_SIZE];
+            snprintf( buf, sizeof(buf)
+                    , "[%s] tc_get_unique_string() failed, ""error=%d (%s)\n"
+                    , method_name, rc, tc_errmsg( rc ) );
+            mon_log_write( MON_CONFIGCONT_GETUNIQUESTRING_1, SQ_LOG_ERR, buf );
+        }
+    }
+    else
+    {   // Lookup succeeded: trace the value, then pass it back through uniqStr
+        if (trace_settings & (TRACE_INIT | TRACE_REQUEST))
+        {
+            trace_printf( "%s@%d Found unique string, stringId(nid=%d, id=%d), string=%s\n"
+                        , method_name, __LINE__
+                        , nid, id, uniqueString );
+        }
+        uniqStr.assign( uniqueString  );
+        result = true;
+    }
+
+    TRACE_EXIT;
+    return result;
+}
+
+bool CConfigContainer::getUniqueStringId( int         nid
+                                        , const char *uniqStr
+                                        , strId_t    &strId )
+{   // Look up the id of uniqStr for the given nid; on success store (nid, id) in strId and return true.
+    const char method_name[] = "CConfigContainer::getUniqueStringId";
+    TRACE_ENTRY;
+
+    bool result = false;    // remains false unless tc_get_unique_string_id() returns 0
+    int rc;
+    int id;                 // receives the id from tc_get_unique_string_id()
+
+    if (trace_settings & (TRACE_INIT | TRACE_REQUEST))
+    {
+        trace_printf( "%s@%d finding unique string nid=%d string=%s\n"
+                    , method_name, __LINE__
+                    , nid, uniqStr );
+    }
+
+    rc = tc_get_unique_string_id( nid, uniqStr, &id );
+    if ( rc )
+    {   // Lookup failed: strId is left untouched and false is returned
+        if ( rc != TCDBNOEXIST )    // TCDBNOEXIST is not logged (presumably it just means "no such entry" -- confirm)
+        {
+            char buf[MON_STRING_BUF_SIZE];
+            snprintf( buf, sizeof(buf)
+                    , "[%s] tc_get_unique_string_id() failed, ""error=%d (%s)\n"
+                    , method_name, rc, tc_errmsg( rc ) );
+            mon_log_write( MON_CONFIGCONT_GETUNIQUESTRINGID_1, SQ_LOG_ERR, buf );
+        }
+    }
+    else
+    {   // Lookup succeeded: report both halves of the string id to the caller
+        strId.nid = nid;
+        strId.id = id;
+        result = true;
+    }
+
+    TRACE_EXIT;
+    return result;
+}
+
 void CConfigContainer::strIdToString( strId_t stringId,  string & value )
 {
     const char method_name[] = "CConfigContainer::strIdToString";
@@ -971,7 +1026,7 @@
     int rc;
     char uniqueString[TC_UNIQUE_STRING_VALUE_MAX] = { 0 };
 
-    if (trace_settings & (TRACE_INIT | TRACE_REQUEST))
+    if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL))
     {
         trace_printf( "%s@%d Get unique string, stringId(nid=%d, id=%d)\n"
                     , method_name, __LINE__
@@ -988,7 +1043,7 @@
             snprintf( buf, sizeof(buf)
                     , "[%s] tc_get_unique_string() failed, ""error=%d (%s)\n"
                     , method_name, rc, tc_errmsg( rc ) );
-            mon_log_write( MON_CONFIGCONT_STRINGIDTPSTRING_1, SQ_LOG_ERR, buf );
+            mon_log_write( MON_CONFIGCONT_STRINGIDTOSTRING_1, SQ_LOG_ERR, buf );
         }
     }
     else
@@ -1067,9 +1122,9 @@
     int totalUniqueIds = 0;
     int uniqueIdSize = 0;
     
-    for (int pnid = 0; pnid < Monitor->GetConfigPNodesMax(); pnid++)
+    for (int index = 0; index < Nodes->GetPNodesCount(); index++)
     {
-        tc_get_unique_string_id_max (pnid, &maxUniqueId);
+        tc_get_unique_string_id_max( Nodes->GetPNidByMap( index ), &maxUniqueId );
         totalUniqueIds += maxUniqueId;
     }
     
@@ -1103,8 +1158,8 @@
       {
           // Get cluster scope configuration registry entries count
           rc = tc_get_registry_cluster_set( &regClusterCount
-                                    , regClusterMax
-                                    , NULL );
+                                          , regClusterMax
+                                          , NULL );
           if ( rc )
           {
               char la_buf[MON_STRING_BUF_SIZE];
@@ -1119,8 +1174,8 @@
           regClusterMax = regClusterCount;
           regClusterConfig = new TcRegistryConfiguration_t[regClusterMax];
           rc = tc_get_registry_cluster_set( &regClusterCount
-                                              , regClusterMax
-                                              , regClusterConfig );
+                                          , regClusterMax
+                                          , regClusterConfig );
           if ( rc )
           {
               char la_buf[MON_STRING_BUF_SIZE];
@@ -1173,7 +1228,7 @@
            // programming error
           char la_buf[MON_STRING_BUF_SIZE];
           snprintf( la_buf, sizeof(la_buf)
-                    , "[%s]Rregistry access failed!\n"
+                    , "[%s]Registry access failed!\n"
                     , method_name );
           mon_log_write(MON_CONFIGCONT_INIT_2, SQ_LOG_CRIT, la_buf);
           TRACE_EXIT;
@@ -1193,7 +1248,7 @@
         regClusterEntry->valueLength = strlen (regClusterConfig[i].value);
         if (trace_settings & (TRACE_INIT | TRACE_REQUEST))
         {
-            trace_printf ("%s@%d pack type %d, scope %s (%d), key %s (%d), value %s(%d)\n",method_name, __LINE__,
+            trace_printf ("%s@%d - pack type %d, scope %s (%d), key %s (%d), value %s(%d)\n",method_name, __LINE__,
                            regClusterEntry->type, regClusterConfig[i].scope, 
                            regClusterEntry->scopeLength,regClusterConfig[i].key,regClusterEntry->keyLength,  
                            regClusterConfig[i].value, regClusterEntry->valueLength);
@@ -1228,6 +1283,15 @@
     {
          delete [] regClusterConfig; 
     }
+
+    if (trace_settings & (TRACE_INIT | TRACE_REQUEST))
+    {
+        trace_printf( "%s@%d - Packed Registry, scope(%s), entries=%d\n"
+                    , method_name, __LINE__
+                    , (type == ConfigType_Cluster) ? "CLUSTER" : "PROCESS"
+                    , numberOfEntries );
+    } 
+
     return numberOfEntries;
     
     TRACE_EXIT;
@@ -1243,25 +1307,27 @@
      char myValue [TC_REGISTRY_VALUE_MAX];
      char myKey[TC_REGISTRY_KEY_MAX];
 
+     if (trace_settings & (TRACE_INIT | TRACE_REQUEST))
+     {
+          trace_printf( "%s@%d - Unpacking Registry, entries=%d\n"
+                      , method_name, __LINE__
+                      , count );
+     } 
+ 
      if (count <= 0)
      {
          TRACE_EXIT;
          return;
      }
+
      struct cluster_set *clusterObj2 =  (cluster_set *)buffer;
      char *stringData2 = &clusterObj2->stringData;
      for (int i = 0; i < count; i++ )
      {     
         memset (myScope, '\0', sizeof (myScope));
-        memset (myValue, '\0', sizeof (myValue));
         memset (myKey, '\0', sizeof (myKey));
+        memset (myValue, '\0', sizeof (myValue));
 
-        if (trace_settings & (TRACE_INIT | TRACE_REQUEST))
-        {
-            trace_printf ("%s@%d scope length %d, key length %d, value length %d\n", method_name, __LINE__,
-                          clusterObj2->scopeLength, 
-                          clusterObj2->keyLength, clusterObj2->valueLength);
-        }
         type = clusterObj2->type;
 
         memcpy(myScope, stringData2,  clusterObj2->scopeLength);
@@ -1273,6 +1339,24 @@
         memcpy(myValue, stringData2,  clusterObj2->valueLength);
         stringData2 += clusterObj2->valueLength;
 
+        if (trace_settings & (TRACE_INIT | TRACE_REQUEST))
+        {
+            trace_printf( "%s@%d - scope=%s(%d), key=%s(%d), value=%s(%d)\n"
+                        , method_name, __LINE__
+                        , myScope
+                        , clusterObj2->scopeLength
+                        , myKey
+                        , clusterObj2->keyLength
+                        , myValue
+                        , clusterObj2->valueLength);
+        }
+
+        Set( myScope
+           , type
+           , myKey
+           , myValue
+           , true );
+
         if ((i+1) < count)
         {
             clusterObj2 = (cluster_set *)stringData2;
@@ -1280,12 +1364,6 @@
         }
     } 
     
-    Set( myScope
-       , type
-       , myValue
-       , myKey
-       , false );
-    
     buffer = stringData2;
     TRACE_EXIT; 
 }
@@ -1296,37 +1374,39 @@
     TRACE_ENTRY;
     int maxUniqueId = 0;
     char unique_string[TC_UNIQUE_STRING_VALUE_MAX] = { 0 };
-    int  stringDataLen;
     char *bufPtr = buffer;
 
     struct unique_string_set *stringObj = (struct unique_string_set *)bufPtr;
     char *stringData = &stringObj->stringData;
     int numberOfEntries = 0;
 
-    for (int pnid = 0; pnid < Monitor->GetConfigPNodesMax(); pnid++)
+    for (int index = 0; index < Nodes->GetPNodesCount(); index++)
     {
-        tc_get_unique_string_id_max (pnid, &maxUniqueId);
+        int nid = Nodes->GetPNidByMap( index );
+        tc_get_unique_string_id_max( nid, &maxUniqueId );
 
         for (int maxId = 0; maxId <= maxUniqueId; maxId++)
         {
              memset (unique_string, 0, TC_UNIQUE_STRING_VALUE_MAX);
-             int error = tc_get_unique_string( pnid, maxId, unique_string );
+             int error = tc_get_unique_string( nid, maxId, unique_string );
 
              if (!error)
              {
                  stringObj->stringLength = strlen(unique_string);
                  if (trace_settings & (TRACE_INIT | TRACE_REQUEST))
                  {
-                      trace_printf ("%s@%d  packing nid %d, unique id %d, stringt %s (length %d)\n", method_name, __LINE__,
-                                     pnid, maxId, unique_string,stringObj->stringLength );
+                      trace_printf( "%s@%d - [%d] Packing Unique String, nid=%d, "
+                                    "unique_id=%d, string=%s (length=%d)\n"
+                                  , method_name, __LINE__
+                                  , numberOfEntries, nid, maxId
+                                  , unique_string, stringObj->stringLength );
                  } 
                  stringObj->unique_id = maxId;
-                 stringObj->nid = pnid;
+                 stringObj->nid = nid;
                  memcpy (stringData, unique_string, stringObj->stringLength);
-                 stringDataLen+= stringObj->stringLength;
                  stringData+=stringObj->stringLength;
                  ++numberOfEntries;
-                 if ((pnid + 1) < Monitor->GetConfigPNodesMax())
+                 if (index < Nodes->GetPNodesCount())
                  {
                       stringObj = ( unique_string_set *)stringData;
                       stringData = &stringObj->stringData;
@@ -1334,9 +1414,15 @@
              }
              // don't advance if we didn't write anything
         }
-
     }
     
+    if (trace_settings & (TRACE_INIT | TRACE_REQUEST))
+    {
+         trace_printf( "%s@%d - Packed Unique Strings, entries=%d\n"
+                     , method_name, __LINE__
+                     , numberOfEntries );
+    } 
+
     if (numberOfEntries > 0)
     {
         buffer = stringData;    
@@ -1350,36 +1436,73 @@
 {
     const char method_name[] = "CConfigContainer::UnpackUniqueStrings";
     TRACE_ENTRY;
-    
+
+    if (trace_settings & (TRACE_INIT | TRACE_REQUEST))
+    {
+         trace_printf( "%s@%d - Unpacking Unique Strings, entries=%d\n"
+                     , method_name, __LINE__
+                     , entries );
+    } 
+
     if (entries <= 0)
-    { 
+    {
          TRACE_EXIT;
          return;
     }
-     
+
     struct unique_string_set *stringObj =  (unique_string_set *)buffer;
     char *stringData = &stringObj->stringData;
     char unique_string[TC_UNIQUE_STRING_VALUE_MAX] = { 0 };
-    
+    int nid = -1;
+    int unique_id = -1;
+    strId_t id;
+
     for (int i = 0; i < entries; i++)
     {
         if (stringObj)
         {
-            int maxUniqueId;
-            tc_get_unique_string_id_max (stringObj->nid, &maxUniqueId);
-            memset(unique_string, 0, TC_UNIQUE_STRING_VALUE_MAX);
-            memcpy (unique_string, stringData,stringObj->stringLength);
-            if (stringObj->unique_id > maxUniqueId)
+            if (nid != stringObj->nid)
             {
-                tc_put_unique_string( stringObj->nid, stringObj->unique_id, unique_string );
+                nid = stringObj->nid;
+            }
+
+            unique_id = stringObj->unique_id;
+            memset( unique_string, 0, TC_UNIQUE_STRING_VALUE_MAX );
+            memcpy( unique_string, stringData,stringObj->stringLength );
+            if ( ! Config->getUniqueStringId( nid, unique_string, id ) )
+            {   // The string is not in the configuration database, add it
+                id.id  = unique_id;
+                id.nid = nid;
+        
+                if (trace_settings & (TRACE_INIT | TRACE_REQUEST))
+                {
+                     trace_printf( "%s@%d - [%d] Adding Unique String, nid=%d, "
+                                   "unique_id=%d, string=%s (length=%d)\n"
+                                 , method_name, __LINE__, i
+                                 , nid, unique_id, unique_string
+                                 , stringObj->stringLength );
+                } 
+
+                Config->addUniqueString( id.nid, id.id, unique_string );
+            }
+            else
+            {
+                if (trace_settings & (TRACE_INIT | TRACE_REQUEST))
+                {
+                     trace_printf( "%s@%d - [%d] Unique String exists, nid=%d, "
+                                   "unique_id=%d, string=%s (length=%d)\n"
+                                 , method_name, __LINE__, i
+                                 , nid, unique_id, unique_string
+                                 , stringObj->stringLength );
+                } 
             }
 
             stringData += stringObj->stringLength;
-        }
-        if ((i + 1) < entries)
-        {
-             stringObj = (unique_string_set *)stringData;
-             stringData = &stringObj->stringData;
+            if (i  < entries)
+            {
+                 stringObj = (unique_string_set *)stringData;
+                 stringData = &stringObj->stringData;
+            }
         }
     }
 
diff --git a/core/sqf/monitor/linux/config.h b/core/sqf/monitor/linux/config.h
index 80b1089..21d6f1f 100644
--- a/core/sqf/monitor/linux/config.h
+++ b/core/sqf/monitor/linux/config.h
@@ -140,8 +140,9 @@
     void addDbProcData ( const char * procName, const char * key, const char * dataValue );
     void addDbClusterData ( const char * key, const char * dataValue );
     void addUniqueString(int nid, int id, const char * uniqStr );
-    bool findUniqueString(int nid, const char * uniqStr, strId_t & id );
     int  getMaxUniqueId( int nid );
+    bool getUniqueString(int nid, int id, string & uniqStr );
+    bool getUniqueStringId(int nid, const char * uniqStr, strId_t & id );
     void strIdToString ( strId_t stringId, string & value );
 
     int    PackRegistry( char *&buffer, ConfigType type );
diff --git a/core/sqf/monitor/linux/device.cxx b/core/sqf/monitor/linux/device.cxx
index ac73244..5f9477a 100644
--- a/core/sqf/monitor/linux/device.cxx
+++ b/core/sqf/monitor/linux/device.cxx
@@ -57,6 +57,7 @@
 #define LUNMGR_STATE_MOUNT_ERROR_MOUNT_FAILED         25
 #define LUNMGR_STATE_MOUNT_ERROR_SOME_SUCCEEDED       26
 
+extern bool NameServerEnabled;
 extern int MyPNID;
 extern CMonitor *Monitor;
 extern CNodeContainer *Nodes;
@@ -447,9 +448,36 @@
 
     if ( rstate && replicate )
     {
-        // Replicate the mount to other nodes
-        CReplDevice *repl = new CReplDevice(this);
-        Replicator.addItem(repl);
+        if ( NameServerEnabled )
+        {
+            int rc = -1;
+
+#if 0
+            rc = PtpClient->ProcessDevice(this); // TODO: in future
+#endif
+            if (rc)
+            {
+                char la_buf[MON_STRING_BUF_SIZE];
+                snprintf( la_buf, sizeof(la_buf)
+                        , "[%s] - Logical device mount request not supported: "
+                          "device %s (%d, %d): "
+                          "zone=%d, primaryZid_=%d, backupZid_=%d\n"
+                        , method_name
+                        , process->GetName()
+                        , process->GetNid()
+                        , process->GetPid()
+                        , zone
+                        , primaryZid_
+                        , backupZid_  );
+                mon_log_write(MON_LDEVICE_MOUNT_1, SQ_LOG_ERR, la_buf);
+            }
+        }
+        else
+        {
+            // Replicate the mount to other nodes
+            CReplDevice *repl = new CReplDevice(this);
+            Replicator.addItem(repl);
+        }
     }
 
     TRACE_EXIT;
@@ -522,9 +550,37 @@
     rstate = (pstate && bstate);
     if ( replicate )
     {
-        // Replicate the mount to other nodes
-        CReplDevice *repl = new CReplDevice(this);
-        Replicator.addItem(repl);
+        if ( NameServerEnabled )
+        {
+            int rc = -1;
+            CProcess *process = NULL;
+            Nodes->GetNode( (char *) GetName(), &process );
+
+#if 0
+            rc = PtpClient->ProcessDevice(this); // TODO: in future
+#endif
+            if (rc)
+            {
+                char la_buf[MON_STRING_BUF_SIZE];
+                snprintf( la_buf, sizeof(la_buf)
+                        , "[%s] - Logical device unmount request not supported: "
+                          "device %s (%d, %d), "
+                          "primary=%d, mirror=%d\n"
+                        , method_name
+                        , process?process->GetName():""
+                        , process?process->GetNid():-1
+                        , process?process->GetPid():-1
+                        , pstate
+                        , bstate  );
+                mon_log_write(MON_LDEVICE_UNMOUNT_1, SQ_LOG_ERR, la_buf);
+            }
+        }
+        else
+        {
+            // Replicate the mount to other nodes
+            CReplDevice *repl = new CReplDevice(this);
+            Replicator.addItem(repl);
+        }
     }
 
     TRACE_EXIT;
@@ -727,9 +783,34 @@
         ldev = ConfigDevice (process);
         if (ldev)
         {
-            // Replicate the device to other nodes
-            CReplDevice *repl = new CReplDevice(ldev);
-            Replicator.addItem(repl);
+            if ( NameServerEnabled )
+            {
+                int rc = -1;
+                CProcess *process = NULL;
+                Nodes->GetNode( (char *) ldev->GetName(), &process );
+    
+#if 0
+                rc = PtpClient->ProcessDevice(this); // TODO: in future
+#endif
+                if (rc)
+                {
+                    char la_buf[MON_STRING_BUF_SIZE];
+                    snprintf( la_buf, sizeof(la_buf)
+                            , "[%s] - Create logical device request not supported: "
+                              "device %s (%d, %d)\n"
+                            , method_name
+                            , process?process->GetName():""
+                            , process?process->GetNid():-1
+                            , process?process->GetPid():-1 );
+                    mon_log_write(MON_LDEVICE_CREATEDEVICE_1, SQ_LOG_ERR, la_buf);
+                }
+            }
+            else
+            {
+                // Replicate the device to other nodes
+                CReplDevice *repl = new CReplDevice(ldev);
+                Replicator.addItem(repl);
+            }
         }
         else
         {
diff --git a/core/sqf/monitor/linux/healthcheck.cxx b/core/sqf/monitor/linux/healthcheck.cxx
index 203465c..50aeca7 100644
--- a/core/sqf/monitor/linux/healthcheck.cxx
+++ b/core/sqf/monitor/linux/healthcheck.cxx
@@ -53,17 +53,75 @@
 #include "process.h"
 #include "redirector.h"
 #include "replicate.h"
+#include "zclient.h"
 
 extern CReqQueue ReqQueue;
 extern CMonitor *Monitor;
+extern CNodeContainer *Nodes;
 extern CNode *MyNode;
 #ifndef NAMESERVER_PROCESS
 extern CRedirector Redirector;
 #endif
 extern CHealthCheck HealthCheck;
 extern CReplicate Replicator;
+extern CZClient    *ZClient;
 extern int MyPNID;
 extern bool IsRealCluster;
+extern bool ZClientEnabled;
+extern char MasterMonitorName[MAX_PROCESS_PATH];
+
+const char *StateString( STATE state);
+
+// Translate a HealthCheckStates value into a printable name.
+//
+// Parameters:
+//   state - the health check state to name
+//
+// Returns:
+//   The enumerator's own spelling as a string literal, or
+//   "HealthCheckState - Undefined" for any value without a case below.
+const char *HealthCheckStateString( HealthCheckStates state)
+{
+    switch( state )
+    {
+        case HC_AVAILABLE:
+            return( "HC_AVAILABLE" );
+        case HC_UPDATE_SMSERVICE:
+            return( "HC_UPDATE_SMSERVICE" );
+
+        case HC_UPDATE_WATCHDOG:
+            return( "HC_UPDATE_WATCHDOG" );
+        case MON_READY:
+            return( "MON_READY" );
+
+        case MON_SHUT_DOWN:
+            return( "MON_SHUT_DOWN" );
+        case MON_NODE_QUIESCE:
+            return( "MON_NODE_QUIESCE" );
+
+        case MON_SCHED_NODE_DOWN:
+            return( "MON_SCHED_NODE_DOWN" );
+        case MON_NODE_DOWN:
+            return( "MON_NODE_DOWN" );
+
+        case MON_STOP_WATCHDOG:
+            return( "MON_STOP_WATCHDOG" );
+        case MON_START_WATCHDOG:
+            return( "MON_START_WATCHDOG" );
+
+        case MON_EXIT_PRIMITIVES:
+            return( "MON_EXIT_PRIMITIVES" );
+        case HC_EXIT:
+            return( "HC_EXIT" );
+
+        default:
+            break;
+    }
+
+    // Only values without a case above fall through to here.
+    return( "HealthCheckState - Undefined" );
+}
+
 
 // constructor
 CHealthCheck::CHealthCheck()
@@ -75,6 +133,7 @@
     clock_gettime(CLOCK_REALTIME, &currTime_);
     lastReqCheckTime_ = currTime_;
     lastSyncCheckTime_ = currTime_;
+    nextHealthLogTime_ = currTime_;
     quiesceStartTime_.tv_sec = 0;
     quiesceStartTime_.tv_nsec = 0;
     nonresponsiveTime_.tv_sec = 0;
@@ -87,7 +146,7 @@
 
     initializeVars();
 
-    monSyncTimeout_ = -1;
+    monSyncTimeout_ = CHealthCheck::SYNC_TIMEOUT_DEFAULT;
     char *monSyncTimeoutC = getenv("SQ_MON_SYNC_TIMEOUT");
     if ( monSyncTimeoutC ) 
     {
@@ -115,6 +174,18 @@
         quiesceTimeoutSec_     = atoi(quiesceTimeoutC);
     }
 
+    healthLoggingFrequency_    = CHealthCheck::HEALTH_LOGGING_FREQUENCY_DEFAULT;
+    char *healthLoggingFrequencyC = getenv("SQ_HEALTH_LOGGING_FREQUENCY");
+    if (healthLoggingFrequencyC)
+    {
+        healthLoggingFrequency_     = atoi(healthLoggingFrequencyC);
+        if (healthLoggingFrequency_ < CHealthCheck::HEALTH_LOGGING_FREQUENCY_MIN)
+        {
+            healthLoggingFrequency_ = CHealthCheck::HEALTH_LOGGING_FREQUENCY_MIN;
+        }
+    }
+    nextHealthLogTime_.tv_sec += healthLoggingFrequency_;
+    
 #ifdef NAMESERVER_PROCESS
     cpuSchedulingDataEnabled_ = false;
 #else
@@ -210,12 +281,8 @@
     char buf[MON_STRING_BUF_SIZE];
     sprintf(buf, "[%s], stopping.\n", method_name);
     mon_log_write(MON_HEALTHCHECK_STOP_NS_1, SQ_LOG_CRIT, buf);
-    // Don't generate a core file, abort is intentional
-    struct rlimit limit;
-    limit.rlim_cur = 0;
-    limit.rlim_max = 0;
-    setrlimit(RLIMIT_CORE, &limit);
-    abort();
+
+    mon_failure_exit();
 }
 #endif
 
@@ -229,6 +296,9 @@
     TRACE_ENTRY;
 
     HealthCheckStates state;
+#ifndef NAMESERVER_PROCESS
+    int myPid = getpid();
+#endif
     struct timespec ts;
 
     if (trace_settings & TRACE_HEALTH)
@@ -238,6 +308,17 @@
 
     bool done = false;
 
+    if (trace_settings & (TRACE_HEALTH | TRACE_INIT | TRACE_RECOVERY))
+    {
+        trace_printf( "%s@%d quiesceTimeoutSec_ = %d, syncTimeoutSec_ = %d, "
+                      "workerTimeoutSec_ = %d, healthLoggingFrequency_= %d (secs)\n"
+                    , method_name, __LINE__
+                    , quiesceTimeoutSec_
+                    , monSyncTimeout_
+                    , CReqQueue::REQ_MAX_RESPONSIVE
+                    , healthLoggingFrequency_ );
+    }
+
     // Wait for event or timer to expire
     while (!done) 
     {
@@ -255,6 +336,8 @@
 
 #ifndef NAMESERVER_PROCESS
 #ifdef EXCHANGE_CPU_SCHEDULING_DATA
+        timeToLogHealth( currTime_ );
+
         if (cpuSchedulingDataEnabled_)
         {
             // Replicate this host's CPU scheduling data to other nodes
@@ -265,9 +348,10 @@
 #endif
 
         state = state_;
-
+#if 0
         if ( trace_settings & TRACE_HEALTH )
-            trace_printf("%s@%d State: %d(%s)\n", method_name, __LINE__, state, getStateStr(state));
+            trace_printf("%s@%d State: %s\n", method_name, __LINE__, HealthCheckStateString(state));
+#endif
 
         switch(state)
         {
@@ -411,6 +495,12 @@
         trace_printf("%s@%d sigusr2 signal triggered, work queue is now blocked.\n", 
                       method_name, __LINE__);
 
+    if ( ZClientEnabled )
+    {
+        ZClient->RunningZNodeDelete( MyNode->GetName() );
+        ZClient->MasterZNodeDelete( MyNode->GetName() );
+    }
+
     char buf[MON_STRING_BUF_SIZE];
     sprintf(buf, "[CHealthCheck::sigusr2SignalHandler], work queue to now blocked.\n");
     mon_log_write(MON_HEALTHCHECK_Q_BLOCK, SQ_LOG_CRIT, buf);
@@ -568,7 +658,13 @@
 {
     const char method_name[] = "CHealthCheck::processTimerEvent";
     TRACE_ENTRY;
-
+#if 0
+    if (trace_settings & TRACE_HEALTH)
+    {
+        trace_printf("%s@%d Timer event: wakeup=%lld, current=%ld\n"
+                    , method_name, __LINE__, wakeupTimeSaved_, currTime_.tv_sec );
+    }
+#endif
     // check if request queue is responsive, once every REQ_MAX_RESPONSIVE secs
     if ( (currTime_.tv_sec - lastReqCheckTime_.tv_sec) > CReqQueue::REQ_MAX_RESPONSIVE )
     {
@@ -607,15 +703,27 @@
             {
                 if ( (currTime_.tv_sec - nonresponsiveTime_.tv_sec) > monSyncTimeout_ )
                 {   
-                    ReqQueue.enqueueDownReq(MyPNID);
+                    char buf[MON_STRING_BUF_SIZE];
+                    sprintf( buf
+                           , "[%s], Sync Thread Timeout detected (timeout=%d). "
+                             "Scheduling all nodes down!\n"
+                           , method_name, monSyncTimeout_);
+                    mon_log_write(MON_HEALTHCHECK_TEVENT_4, SQ_LOG_CRIT, buf);
+
+                    if ( ZClientEnabled )
+                    {
+                        ZClient->RunningZNodesDelete();
+                    }
+                    else
+                    {   // queue a node down request for every node reachable from GetFirstNode()
+                        CNode  *node = Nodes->GetFirstNode();
+                        for ( ; node; node = node->GetNext() )
+                        {   // 'node' must advance on each pass or this loop never ends
+                            ReqQueue.enqueueDownReq( node->GetPNid() );
+                        }
+                    }
                     nonresponsiveTime_.tv_sec = 0;
                     nonresponsiveTime_.tv_nsec = 0;
-
-                    char buf[MON_STRING_BUF_SIZE];
-                    sprintf(buf, "[%s], Sync thread timeout detected (timeout"
-                            "=%d). Scheduling Node down.\n", method_name,
-                            monSyncTimeout_);
-                    mon_log_write(MON_HEALTHCHECK_TEVENT_4, SQ_LOG_ERR, buf);
                 }
             }
         }
@@ -648,6 +756,11 @@
     if ( SQ_theLocalIOToClient )
     {
         SQ_theLocalIOToClient->refreshWDT(++refreshCounter_);
+#if 0
+        if (trace_settings & TRACE_HEALTH)
+            trace_printf("%s@%d Watchdog timer refreshed (%d)\n"
+                        , method_name, __LINE__, refreshCounter_ );
+#endif
     }
 #endif
 
@@ -671,7 +784,13 @@
     ts.tv_sec += 1; // wake up every second
 
     wakeupTimeSaved_ = ts.tv_sec;
-
+#if 0
+    if (trace_settings & TRACE_HEALTH)
+    {
+        trace_printf("%s@%d Set Timer event: wakeup=%lld\n"
+                    , method_name, __LINE__, wakeupTimeSaved_ );
+    }
+#endif
     TRACE_EXIT;
 }
 
@@ -761,4 +880,53 @@
     TRACE_EXIT; 
 }
 
+// Log this monitor's node and cluster health messages once currentTime has
+// reached nextHealthLogTime_ (whole seconds), then schedule the next log
+// healthLoggingFrequency_ seconds after currentTime.
+void CHealthCheck::timeToLogHealth(struct timespec &currentTime)
+{
+    const char method_name[] = "CHealthCheck::timeToLogHealth";
+    TRACE_ENTRY;
+
+    const char message_tag[] = "Health Status";
+#if 0
+    if (trace_settings & TRACE_HEALTH)
+    {
+        trace_printf("%s@%d Timer: nextHealthLogTime=%ld, current=%ld\n"
+                    , method_name, __LINE__, nextHealthLogTime_.tv_sec, currTime_.tv_sec );
+    }
+#endif
+    if (currentTime.tv_sec >= nextHealthLogTime_.tv_sec)
+    { // Time to log health status
+        int readyCount = 0;
+        int upCount = Nodes->GetPNodesUpCount(readyCount);
+        char buf[MON_STRING_BUF_SIZE]; // both messages are bounded to this buffer by snprintf
+        snprintf(buf, sizeof(buf), "[%s] - Node: %s, pnid=%d, state=%s, master=%s\n"
+                , message_tag
+                , MyNode->GetName(), MyPNID, StateString(MyNode->GetState()), MasterMonitorName );
+        mon_log_write(MON_HEALTHCHECK_TIMETOLOGHEALTH, SQ_LOG_INFO, buf);
+        snprintf(buf, sizeof(buf), "[%s] - Cluster: node count=%d (State Up - count=%d, Txn Services Ready - count=%d)\n"
+                , message_tag
+                , Nodes->GetPNodesCount(), upCount, readyCount );
+        mon_log_write(MON_HEALTHCHECK_TIMETOLOGHEALTH, SQ_LOG_INFO, buf);
+        // Re-arm from the current time, not from the previous deadline, so a stale
+        // deadline (clock step, stalled thread) yields one log instead of a burst.
+        nextHealthLogTime_.tv_sec = currentTime.tv_sec + healthLoggingFrequency_;
+    }
+    TRACE_EXIT; 
+}
+
+void CHealthCheck::triggerTimeToLogHealth( void ) // make the next timeToLogHealth() check log immediately
+{
+    const char method_name[] = "CHealthCheck::triggerTimeToLogHealth";
+    TRACE_ENTRY;
+    // Announce the trigger when init, recovery or health tracing is enabled.
+    if ((trace_settings & (TRACE_HEALTH | TRACE_RECOVERY | TRACE_INIT)) != 0)
+        trace_printf( "%s@%d - Trigger Health Status logging!\n", method_name, __LINE__);
+
+    // Refresh the cached current time and set the next log time to it.
+    clock_gettime(CLOCK_REALTIME, &currTime_);
+    nextHealthLogTime_ = currTime_;
+    TRACE_EXIT; 
+}
 
diff --git a/core/sqf/monitor/linux/healthcheck.h b/core/sqf/monitor/linux/healthcheck.h
index e637400..94c7704 100644
--- a/core/sqf/monitor/linux/healthcheck.h
+++ b/core/sqf/monitor/linux/healthcheck.h
@@ -62,6 +62,7 @@
     virtual ~CHealthCheck();
 
     inline HealthCheckStates GetState( void ) { CAutoLock alock(healthCheckLock_.getLocker()); return( state_ ); }
+    inline int getSyncTimeout( void ) { return( monSyncTimeout_ ); } // sync timeout in seconds; unlike GetState(), read without healthCheckLock_
     void start();
     void shutdownWork();
     void setState(HealthCheckStates st, long long param1 = 0);
@@ -72,8 +73,13 @@
     static void sigusr2SignalHandler (int , siginfo_t *, void *);
 
     pthread_t tid() { return thread_id_; }
+    void timeToLogHealth(struct timespec &ts);
+    void triggerTimeToLogHealth( void );
 
+    enum { HEALTH_LOGGING_FREQUENCY_DEFAULT = 3600 }; // Default seconds between health log messages (1 hour)
+    enum { HEALTH_LOGGING_FREQUENCY_MIN = 60 };       // Minimum seconds between health log messages (1 minute)
     enum { QUIESCE_TIMEOUT_DEFAULT = 30 };  // Max seconds to wait for SE processes to exit 
+    enum { SYNC_TIMEOUT_DEFAULT = 900 };    // Max seconds to wait for synchThread(main) Allgather IO completion
 
 private:
 
@@ -97,6 +103,8 @@
     struct timespec lastReqCheckTime_;  // last time when request was checked for responsiveness
     struct timespec lastSyncCheckTime_; // last time when sync thread was checked for responsiveness
     struct timespec nonresponsiveTime_; // start time when Sync thread became unresponsive
+    struct timespec nextHealthLogTime_; // next time this monitor's health is to be logged
+    int  healthLoggingFrequency_;       // Seconds between health log messages
     long long wakeupTimeSaved_;         // time when healthcheck thread should wakeup, in secs.
     CProcess * watchdogProcess_;        // ptr to the watchdog process object
     CProcess * smserviceProcess_;       // ptr to the smservice process object
diff --git a/core/sqf/monitor/linux/internal.h b/core/sqf/monitor/linux/internal.h
index 5dc6456..63a46fa 100644
--- a/core/sqf/monitor/linux/internal.h
+++ b/core/sqf/monitor/linux/internal.h
@@ -30,15 +30,18 @@
 {
       InternalType_Null             // Noop
     , InternalType_ActivateSpare    // activate a spare node
-    , InternalType_Clone            // Add clone process to monitor
+    , InternalType_Clone            // (TNS) Add clone process to monitor
+    , InternalType_CreatePrimitives // Create primitive processes local request
+                                    // generated during monitor initialization
+                                    // it is never generated by remote monitor
     , InternalType_Device           // Add or Change device to monitor
     , InternalType_Down             // Node down
     , InternalType_Dump             // Dump process
     , InternalType_DumpComplete     // Dump process complete
     , InternalType_Event            // Send Event to process
-    , InternalType_Exit             // Delete process for monitor
-    , InternalType_IoData           // Stdin/Stdout data for a process
-    , InternalType_Kill             // Kill monitored process
+    , InternalType_Exit             // (TNS) Delete process for monitor
+    , InternalType_IoData           // (TNS) Stdin/Stdout data for a process
+    , InternalType_Kill             // (TNS) Kill monitored process
     , InternalType_NameServerAdd    // Add NameServer to configuration database
     , InternalType_NameServerDelete // Delete NameServer from configuration database
     , InternalType_NodeAdd          // Add Node to configuration database
@@ -46,36 +49,24 @@
     , InternalType_NodeDelete       // Delete Node from configuration database
     , InternalType_NodeDeleted      // Reload Node configuration and send node deleted notice
     , InternalType_NodeName         // Node Name Change 
-    , InternalType_Notify           // Register for monitoring of process death
+    , InternalType_Notify           // (TNS) Register for monitoring of process death
+    , InternalType_Open             // (TNS) Add open to monitor
     , InternalType_PersistAdd       // Add persist template to configuration database
     , InternalType_PersistDelete    // Delete persist template from configuration database
-    , InternalType_Process          // Add process to monitor
-    , InternalType_ProcessInit      // Process fork completed
-    , InternalType_Open             // Add open to monitor
-    , InternalType_Set              // Add or change configuration key
-    , InternalType_StdinReq         // Request stdin data via ancestor
-    , InternalType_Sync             // Sync monitor for Bcast operation
-    , InternalType_Up               // Node up
-    , InternalType_CreatePrimitives // Create primitive processes local request
-                                    // generated during monitor initialization
-                                    // it is never generated by remote monitor
-    , InternalType_Quiesce          // Quiesce started
     , InternalType_PostQuiece       // Quiesce ended
+    , InternalType_Process          // (TNS) Add process to monitor
+    , InternalType_ProcessInit      // (TNS) Process fork completed
+    , InternalType_Quiesce          // Quiesce started
     , InternalType_Revive           // Revive request
-    , InternalType_Snapshot         // Snapshot request
-    , InternalType_UniqStr          // Unique string
-    , InternalType_TmReady          // DTM ready for transactions
-    , InternalType_Shutdown         // Shutdown
     , InternalType_SchedData        // Processor & memory statistics
-    , InternalType_SoftNodeDown     // Soft Node down (SQWatchdog not triggered)
-    , InternalType_SoftNodeUp       // Soft Node up
-};
-
-enum SyncType
-{
-    SyncType_NULL,                  // No sync requested
-    SyncType_TmData,                // sync to exchange TM data amoung TMs
-    SyncType_TmSyncState            // sync the current TmSync state across the cluster
+    , InternalType_Set              // Add or change configuration key
+    , InternalType_Shutdown         // Shutdown
+    , InternalType_Snapshot         // Snapshot request
+    , InternalType_StdinReq         // (TNS) Request stdin data via ancestor
+    , InternalType_TmReady          // DTM ready for transactions
+    , InternalType_Up               // Node up
+    , InternalType_UniqStr          // (TNS) Unique string
+    , InternalType_Invalid          // Invalid
 };
 
 enum SyncState
@@ -91,8 +82,6 @@
 typedef enum {
     State_Default=0,
     State_Quiesce,                  // node quiesce state while going down
-    State_SoftDown,                 // node soft down on DTM abort -> restart
-    State_SoftUp,                   // node soft up on DTM restart
     State_Ready_To_Exit
 } IntNodeState; 
 
@@ -418,18 +407,6 @@
     int spare_pnid;                      // spare node physical node id
 };
 
-struct sync_def
-{
-    SyncType  type;                      // Coordination type
-    int       pnid;                      // My Physical Node id
-    int       syncnid;                   // TM's Logical Node driving the sync
-    int       tmleader;                  // TM Leader's Logical Node id
-    SyncState state;                     // My current TM sync state 
-    int       count;                     // # of requests in data
-    int       length;                    // data buffer length
-    char      data[MAX_SYNC_DATA];       // Length/Data pairs to by replicated across cluster
-};
-
 struct up_def
 {
     int pnid;                       // Physical node id
@@ -473,7 +450,6 @@
         struct open_def    open;
         struct set_def     set;
         struct stdin_req_def stdin_req;
-        struct sync_def    sync;
         struct up_def      up;
         struct spare_def   activate_spare;
         struct uniqstr_def uniqstr;
@@ -502,6 +478,14 @@
     int  nsPNid;   // NS better
 } nodeId_t;
 
+typedef struct nodeSyncInfo_s
+{
+    char nodeName[MPI_MAX_PROCESSOR_NAME]; // node name buffer of MPI_MAX_PROCESSOR_NAME bytes
+    int  pnid;                             // presumably the physical node id, as in up_def.pnid -- TODO confirm
+    unsigned long long seqNum;             // NOTE(review): a sequence number; which counter it carries is not visible here -- confirm
+    unsigned long long reconnectSeqNum;    // NOTE(review): presumably the sequence number used on reconnect -- confirm
+} nodeSyncInfo_t;
+
 typedef struct nodeStatus_s
 {
     STATE state;
diff --git a/core/sqf/monitor/linux/lnode.cxx b/core/sqf/monitor/linux/lnode.cxx
index bbe5ac4..c7550f3 100644
--- a/core/sqf/monitor/linux/lnode.cxx
+++ b/core/sqf/monitor/linux/lnode.cxx
@@ -238,6 +238,7 @@
         const char *nodeName = GetNode()->GetName();
         if (IsRealCluster)
         {
+            nodeName = GetNode()->GetFqdn();
             STRCPY(msg->u.request.u.node_added.node_name, nodeName);
         }
         else
@@ -288,6 +289,7 @@
         const char *nodeName = GetNode()->GetName();
         if (IsRealCluster)
         {
+            nodeName = GetNode()->GetFqdn();
             STRCPY(msg->u.request.u.node_changed.node_name, nodeName);
         }
         else
@@ -338,6 +340,7 @@
         const char *nodeName = GetNode()->GetName();
         if (IsRealCluster)
         {
+            nodeName = GetNode()->GetFqdn();
             STRCPY(msg->u.request.u.node_deleted.node_name, nodeName);
         }
         else
@@ -388,6 +391,7 @@
         const char * nodeName = GetNode()->GetName();
         if (IsRealCluster)
         {
+            nodeName = GetNode()->GetFqdn();
             STRCPY(msg->u.request.u.down.node_name, nodeName);
         }
         else
@@ -560,176 +564,157 @@
     return( lnodes_->GetNode()->IsKillingNode() );
 }
 
-CLNode *CLNode::Link (CLNode * entry)
+CLNode *CLNode::LinkAfter( CLNode * &tail, CLNode * entry ) // insert entry right after this node in the next_/prev_ list; returns entry
 {
-    const char method_name[] = "CLNode::Link";
+    const char method_name[] = "CLNode::LinkAfter";
     TRACE_ENTRY;
 
-    next_ = entry;
     entry->prev_ = this;
+    if (next_ == NULL)
+    { // this node has no successor: entry becomes the new tail
+        entry->next_ = NULL;
+        tail = entry; // tail is a reference, so the caller's tail pointer is updated
+    }
+    else
+    { // splice entry between this node and its current successor
+        entry->next_ = next_;
+        next_->prev_ = entry;
+    }
+    next_ = entry;
 
+    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+    { // report the nids around the new link; -1 marks a NULL neighbor
+        trace_printf( "%s@%d - Linked logical node object "
+                      "tail=%d\n"
+                      "\t\tthis: prev=%d, this=%d, next=%d\n"
+                      "\t\tentry: prev=%d, entry=%d, next=%d\n"
+                    , method_name, __LINE__
+                    , tail->GetNid()
+                    , prev_?prev_->GetNid():-1
+                    , GetNid()
+                    , next_?next_->GetNid():-1
+                    , entry->prev_?entry->prev_->GetNid():-1
+                    , entry->GetNid()
+                    , entry->next_?entry->next_->GetNid():-1 );
+    }
+    
     TRACE_EXIT;
     return entry;
 }
 
-CLNode *CLNode::LinkP(CLNode * entry)
+CLNode *CLNode::LinkBefore( CLNode * &head, CLNode * entry ) // insert entry right before this node in the next_/prev_ list; returns entry
 {
-    const char method_name[] = "CLNode::LinkP";
+    const char method_name[] = "CLNode::LinkBefore";
     TRACE_ENTRY;
 
-    nextP_ = entry;
+    entry->next_ = this;
+    if (prev_ == NULL)
+    { // this node has no predecessor: entry becomes the new head
+        entry->prev_ = NULL;
+        head = entry; // head is a reference, so the caller's head pointer is updated
+    }
+    else
+    { // splice entry between this node's current predecessor and this node
+        entry->prev_ = prev_;
+        prev_->next_ = entry;
+    }
+    prev_ = entry;
+
+    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+    { // report the nids around the new link; -1 marks a NULL neighbor
+        trace_printf( "%s@%d - Linked logical node object "
+                      "head=%d\n"
+                      "\t\tthis: prev=%d, this=%d, next=%d\n"
+                      "\t\tentry: prev=%d, entry=%d, next=%d\n"
+                    , method_name, __LINE__
+                    , head->GetNid()
+                    , prev_?prev_->GetNid():-1
+                    , GetNid()
+                    , next_?next_->GetNid():-1
+                    , entry->prev_?entry->prev_->GetNid():-1
+                    , entry->GetNid()
+                    , entry->next_?entry->next_->GetNid():-1 );
+    }
+    
+    TRACE_EXIT;
+    return entry;
+}
+
+CLNode *CLNode::LinkAfterP( CLNode * &tail, CLNode * entry ) // insert entry right after this node in the nextP_/prevP_ list; returns entry
+{
+    const char method_name[] = "CLNode::LinkAfterP";
+    TRACE_ENTRY;
+
     entry->prevP_ = this;
+    if (nextP_ == NULL)
+    { // this node has no successor: entry becomes the new tail
+        entry->nextP_ = NULL;
+        tail = entry; // tail is a reference, so the caller's tail pointer is updated
+    }
+    else
+    { // splice entry between this node and its current successor
+        entry->nextP_ = nextP_;
+        nextP_->prevP_ = entry;
+    }
+    nextP_ = entry;
 
+    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+    { // report the nids around the new link; -1 marks a NULL neighbor
+        trace_printf( "%s@%d - Linked logical node object "
+                      "tail=%d\n"
+                      "\t\tthis: prev=%d, this=%d, next=%d\n"
+                      "\t\tentry: prev=%d, entry=%d, next=%d\n"
+                    , method_name, __LINE__
+                    , tail->GetNid()
+                    , prevP_?prevP_->GetNid():-1
+                    , GetNid()
+                    , nextP_?nextP_->GetNid():-1
+                    , entry->prevP_?entry->prevP_->GetNid():-1
+                    , entry->GetNid()
+                    , entry->nextP_?entry->nextP_->GetNid():-1 );
+    }
+    
     TRACE_EXIT;
     return entry;
 }
 
-#ifndef NAMESERVER_PROCESS
-void CLNode::PrepareForTransactions( bool activatingSpare )
+CLNode *CLNode::LinkBeforeP( CLNode * &head, CLNode * entry ) // insert entry right before this node in the nextP_/prevP_ list; returns entry
 {
-    const char method_name[] = "CLNode::PrepareForTransactions";
+    const char method_name[] = "CLNode::LinkBeforeP";
     TRACE_ENTRY;
 
-    struct  message_def *msg;
-
-    if ( trace_settings & 
-        (TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC | TRACE_INIT) )
+    entry->nextP_ = this;
+    if (prevP_ == NULL) // no predecessor: entry becomes the new head
     {
-        trace_printf( "%s@%d -  %s (nid=%d, state=%d) sending prepare notice to DTM and SPX\n"
+        entry->prevP_ = NULL;
+        head = entry; // head is a reference, so the caller's head pointer is updated
+    }
+    else
+    { // splice entry between this node's current predecessor and this node
+        entry->prevP_ = prevP_;
+        prevP_->nextP_ = entry;
+    }
+    prevP_ = entry;
+
+    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+    { // report the nids around the new link; -1 marks a NULL neighbor
+        trace_printf( "%s@%d - Linked logical node object "
+                      "head=%d\n"
+                      "\t\tthis: prev=%d, this=%d, next=%d\n"
+                      "\t\tentry: prev=%d, entry=%d, next=%d\n"
                     , method_name, __LINE__
-                    , MyNode->GetName(), MyNode->GetPNid(), MyNode->GetState());
+                    , head->GetNid()
+                    , prevP_?prevP_->GetNid():-1
+                    , GetNid()
+                    , nextP_?nextP_->GetNid():-1
+                    , entry->prevP_?entry->prevP_->GetNid():-1
+                    , entry->GetNid()
+                    , entry->nextP_?entry->nextP_->GetNid():-1 );
     }
-
-    if ( MyNode->GetState() == State_Up )
-    {
-        CLNode *lnode = MyNode->GetFirstLNode();
-        for ( ; lnode; lnode = lnode->GetNextP() )
-        {
-            // Send local DTM processes a node prepare message for each
-            // logical node activated by spare node
-            CProcess   *process = lnode->GetProcessLByType( ProcessType_DTM );
-            if ( process )
-            {
-                // Record statistics (sonar counters)
-                if (sonar_verify_state(SONAR_ENABLED | SONAR_MONITOR_ENABLED))
-                   MonStats->notice_node_up_Incr();
-            
-                // send node prepare notice to our node's DTM process
-                msg = new struct message_def;
-                msg->type = MsgType_NodePrepare;
-                msg->noreply = true;
-                msg->u.request.type = ReqType_Notice;
-                msg->u.request.u.prepare.nid = Nid;
-                msg->u.request.u.prepare.takeover = activatingSpare ? true : false;
-                const char * nodeName = GetNode()->GetName();
-                STRCPY(msg->u.request.u.prepare.node_name, nodeName);
-                SQ_theLocalIOToClient->putOnNoticeQueue( process->GetPid()
-                                                       , process->GetVerifier()
-                                                       , msg
-                                                       , NULL);
-            
-                if ( trace_settings & 
-                    (TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC | TRACE_INIT) )
-                {
-                    trace_printf( "%s@%d - Sending node %d (takeover=%d) prepare notice to DTM %s (pid=%d)\n"
-                                , method_name, __LINE__
-                                , Nid , msg->u.request.u.prepare.takeover
-                                , process->GetName(), process->GetPid() );
-                                
-                }
-            }
-
-            // Send local SPX processes a node prepare message for each
-            // logical node activated by spare node
-            process = lnode->GetProcessLByType( ProcessType_SPX );
-            if ( process )
-            {
-                // Record statistics (sonar counters)
-                if (sonar_verify_state(SONAR_ENABLED | SONAR_MONITOR_ENABLED))
-                   MonStats->notice_node_up_Incr();
-            
-                // send node prepare notice to our node's DTM process
-                msg = new struct message_def;
-                msg->type = MsgType_NodePrepare;
-                msg->noreply = true;
-                msg->u.request.type = ReqType_Notice;
-                msg->u.request.u.prepare.nid = Nid;
-                msg->u.request.u.prepare.takeover = activatingSpare ? true : false;
-                const char * nodeName = GetNode()->GetName();
-                STRCPY(msg->u.request.u.prepare.node_name, nodeName);
-                SQ_theLocalIOToClient->putOnNoticeQueue( process->GetPid()
-                                                       , process->GetVerifier()
-                                                       , msg
-                                                       , NULL);
-            
-                if ( trace_settings & 
-                    (TRACE_RECOVERY | TRACE_REQUEST | TRACE_INIT) )
-                {
-                    trace_printf( "%s@%d - Sending node %d prepare notice to SPX %s (pid=%d)\n"
-                                , method_name, __LINE__, Nid
-                                , process->GetName(), process->GetPid());
-                }
-            }
-        }
-    }
-
+    
     TRACE_EXIT;
+    return entry;
 }
-#endif
-
-#ifndef NAMESERVER_PROCESS
-void CLNode::SendDTMRestarted( void )
-{
-    const char method_name[] = "CLNode::SendDTMRestarted";
-    TRACE_ENTRY;
-
-    struct  message_def *msg;
-
-    if ( trace_settings &
-        (TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC | TRACE_INIT) )
-    {
-        trace_printf( "%s@%d -  %s (pnid=%d, state=%d) sending DTM restarted in nid=%d notice to local DTMs\n"
-                    , method_name, __LINE__
-                    , MyNode->GetName(), MyNode->GetPNid(), MyNode->GetState(), GetNid() );
-    }
-
-    if ( MyNode->GetState() == State_Up )
-    {
-        CLNode *lnode = MyNode->GetFirstLNode();
-        for ( ; lnode; lnode = lnode->GetNextP() )
-        {
-            // Send local DTM processes a DTM restarted message
-            CProcess   *process = lnode->GetProcessLByType( ProcessType_DTM );
-            if ( process )
-            {
-                // send node prepare notice to our node's DTM process
-                msg = new struct message_def;
-                msg->type = MsgType_TmRestarted;
-                msg->noreply = true;
-                msg->u.request.type = ReqType_Notice;
-                msg->u.request.u.tm_restart.nid = Nid;
-                msg->u.request.u.tm_restart.pnid = GetNode()->GetPNid();
-                const char * nodeName = GetNode()->GetName();
-                STRCPY(msg->u.request.u.tm_restart.node_name, nodeName);
-                SQ_theLocalIOToClient->putOnNoticeQueue( process->GetPid()
-                                                       , process->GetVerifier()
-                                                       , msg
-                                                       , NULL);
-
-                if ( trace_settings &
-                    (TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC | TRACE_INIT) )
-                {
-                    trace_printf( "%s@%d - Sending nid=%d DTM restarted notice to DTM %s (nid=%d,pid=%d)\n"
-                                , method_name, __LINE__
-                                , Nid , process->GetName(), process->GetNid(), process->GetPid() );
-                }
-            }
-        }
-    }
-
-    TRACE_EXIT;
-}
-#endif
 
 #ifndef NAMESERVER_PROCESS
 void CLNode::SetAffinity( pid_t pid, PROCESSTYPE type )
@@ -974,6 +959,7 @@
     const char * nodeName = GetNode()->GetName();
     if (IsRealCluster)
     {
+        nodeName = GetNode()->GetFqdn();
         STRCPY(msg->u.request.u.up.node_name, nodeName);
     }
     else
@@ -1068,7 +1054,33 @@
     }
     else
     {
-        tail_ = tail_->Link(lnode);
+        // add to list in nid sort order
+        if (lnode->GetNid() < head_->GetNid())
+        { // link new lnode to the beginning
+            head_->LinkBefore( head_, lnode );
+        }
+        else if (lnode->GetNid() > tail_->GetNid())
+        { // link new lnode to the end
+            tail_->LinkAfter( tail_, lnode );
+        }
+        else
+        {
+            CLNode *entry = head_;
+            CLNode *prevEntry = NULL;
+            while (entry)
+            { // walk the list
+                if (lnode->GetNid() > entry->GetNid())
+                { // new lnode is greater than current list entry
+                    prevEntry = entry;
+                    entry = prevEntry->GetNext();
+                }
+                else
+                { // new lnode is not greater than current list entry: insert after the previous one
+                    prevEntry->LinkAfter( tail_, lnode ); // NOTE(review): prevEntry is still NULL if nid == head_->GetNid() -- confirm duplicate nids are never added
+                    entry = NULL;
+                }
+            }
+        }
     }
 
     if (trace_settings & TRACE_INIT)
@@ -1116,7 +1128,33 @@
     }
     else
     {
-        tail_ = tail_->LinkP(lnode);
+        // add to list in nid sort order
+        if (lnode->GetNid() < head_->GetNid())
+        { // link new lnode to the beginning
+            head_->LinkBeforeP( head_, lnode );
+        }
+        else if (lnode->GetNid() > tail_->GetNid())
+        { // link new lnode to the end
+            tail_->LinkAfterP( tail_, lnode );
+        }
+        else
+        {
+            CLNode *entry = head_;
+            CLNode *prevEntry = NULL;
+            while (entry)
+            { // walk the list through the P links, matching LinkBeforeP/LinkAfterP
+                if (lnode->GetNid() > entry->GetNid())
+                { // new lnode is greater than current list entry
+                    prevEntry = entry;
+                    entry = prevEntry->GetNextP();
+                }
+                else
+                { // new lnode is not greater than current list entry: insert after the previous one
+                    prevEntry->LinkAfterP( tail_, lnode ); // NOTE(review): prevEntry is still NULL if nid == head_->GetNid() -- confirm duplicate nids are never added
+                    entry = NULL;
+                }
+            }
+        }
     }
 
     if (trace_settings & TRACE_INIT)
@@ -1290,7 +1328,7 @@
 
     for (int i = 0; i <  clusterConfig->GetLNodesCount(); i++ )
     {
-        if (LNode[i]->GetNid() == nid)
+        if (indexToNid_[i] == nid)
         {
             return(i);
         }
diff --git a/core/sqf/monitor/linux/lnode.h b/core/sqf/monitor/linux/lnode.h
index f1ec2f8..3929d05 100644
--- a/core/sqf/monitor/linux/lnode.h
+++ b/core/sqf/monitor/linux/lnode.h
@@ -90,8 +90,6 @@
                                       bool system_messages,
                                       struct timespec *creation_time,
                                       int origPNidNs );
-    void    PrepareForTransactions( bool activatingSpare );
-    void    SendDTMRestarted( void );
     inline void SetCpuUser( long long num ) { cpuUser_ = num; }
     inline void SetCpuNice( long long num ) { cpuNice_ = num; }
     inline void SetCpuSystem( long long num ) { cpuSystem_ = num; }
@@ -106,8 +104,11 @@
 
     bool    IsKillingNode( void );
 
-    CLNode *Link( CLNode *entry );
-    CLNode *LinkP( CLNode *entry );
+    CLNode  *LinkAfter( CLNode * &tail, CLNode * entry );
+    CLNode  *LinkBefore( CLNode * &head, CLNode * entry );
+    CLNode  *LinkAfterP( CLNode * &tail, CLNode * entry );
+    CLNode  *LinkBeforeP( CLNode * &head, CLNode * entry );
+
     void    SetAffinity( pid_t pid, PROCESSTYPE type );
     void    SetAffinity( CProcess *process );
 
diff --git a/core/sqf/monitor/linux/macros.gmk b/core/sqf/monitor/linux/macros.gmk
index ea981e0..c30b20b 100644
--- a/core/sqf/monitor/linux/macros.gmk
+++ b/core/sqf/monitor/linux/macros.gmk
@@ -81,7 +81,7 @@
 CXXFLAGS	= $(CXXWARN) $(DBG_FLGS) $(OPTIM_FLGS)
 
 
-CWARN		= -Wno-long-long -fmessage-length=0
+CWARN		= -Wno-long-long -Wno-deprecated -fmessage-length=0
 CXXWARN		= -Wno-long-long -Wno-deprecated -fmessage-length=0
 
 INCLUDES	= -I$(INCEXPDIR) $(INCMISCDIR)
diff --git a/core/sqf/monitor/linux/makefile b/core/sqf/monitor/linux/makefile
index 3d16bab..b3f97c0 100644
--- a/core/sqf/monitor/linux/makefile
+++ b/core/sqf/monitor/linux/makefile
@@ -21,6 +21,8 @@
 
 include ./macros.gmk
 
+# set USE_DUMA=1 to setup DUMA and add to link section in corresponding program
+USE_DUMA=0
 
 # set USE_DMALLOC=1 to setup dmalloc
 USE_DMALLOC=1
@@ -33,7 +35,7 @@
 #DEBUG = -O0 -g3
 DEBUG = $(CFLAGS)
 
-FLAGS = $(DEBUG) -Wno-deprecated -fmessage-length=0 -fPIC
+FLAGS = $(DEBUG) -Wno-error=format-security -Wno-deprecated -fmessage-length=0 -fPIC
 ifeq ($(USE_DMALLOC),1)
  FLAGS+= -DDMALLOC
 endif
@@ -45,16 +47,22 @@
 # Uncomment the following line to see which compiler option controls
 #   a particular diagnostic (GNU compiler).
 # FLAGS+= -fdiagnostics-show-option
-FLAGS+= -Wall -Wextra -Werror
+FLAGS+= -Wall -Wextra -Werror -Wno-unused-variable
 
 # Flags for use in compiling Seabed trace modules
-SBFLAGS = -Wall -Wextra -pedantic -Werror  -Wno-long-long $(DBG_FLGS) $(OPTIM_FLGS) -Xlinker --copy-dt-needed-entries -fPIC
+SBFLAGS = -Wall -Wextra -pedantic -Werror -Wno-error=format-security  -Wno-long-long $(DBG_FLGS) $(OPTIM_FLGS) -Xlinker --copy-dt-needed-entries -fPIC
 
 COMMONLOGGERDIR = ../../commonLogger
 
 INCLUDE = monitor.h msgdef.h internal.h monlogging.h clio.h mlio.h localio.h lock.h
 
-LIBS = -L$(LIBEXPDIR) -L$(ZOOKEEPER_DIR)/lib -lrt -lsblogalt -lzookeeper_mt -ltrafconfig
+ifeq ($(USE_DUMA),1)
+    # Make sure 'libduma.a' is in the lib64d directory
+    LIBDUMA= -lduma
+endif
+
+CC_RUNTIME_LIB = -L/usr/lib/x86_64-linux-gnu -L/lib/x86_64-linux-gnu -L$(TOOLSDIR)/apache-log4cxx-0.10.0/lib
+LIBS = $(CC_RUNTIME_LIB) -L$(LIBEXPDIR) -L$(ZOOKEEPER_DIR)/lib -lrt -lsblogalt -ltrafconfig -lzookeeper_mt 
 
 LIBS+= -lmpich
 
@@ -100,7 +108,6 @@
 MONITORSRC += notice.cxx
 MONITORSRC += config.cxx
 MONITORSRC += monlogging.cxx
-MONITORSRC += tmsync.cxx
 MONITORSRC += device.cxx
 MONITORSRC += monprof.cxx
 MONITORSRC += montrace.cxx
@@ -123,6 +130,7 @@
 MONITORSRC += reqevent.cxx
 MONITORSRC += reqexit.cxx
 MONITORSRC += reqget.cxx
+MONITORSRC += reqinstanceid.cxx
 MONITORSRC += reqkill.cxx
 MONITORSRC += reqmonstats.cxx
 MONITORSRC += reqmount.cxx
@@ -145,7 +153,6 @@
 MONITORSRC += reqstartup.cxx
 MONITORSRC += reqtmleader.cxx
 MONITORSRC += reqtmready.cxx
-MONITORSRC += reqtmsync.cxx
 MONITORSRC += reqzoneinfo.cxx
 MONITORSRC += replicate.cxx
 MONITORSRC += gentrap.cxx
@@ -161,7 +168,6 @@
 MONITOROBJS += $(OUTDIR)/props.o
 MONITOROBJS += $(OUTDIR)/config.o
 MONITOROBJS += $(OUTDIR)/monlogging.o
-MONITOROBJS += $(OUTDIR)/tmsync.o
 MONITOROBJS += $(OUTDIR)/device.o
 MONITOROBJS += $(OUTDIR)/mlio.o
 MONITOROBJS += $(OUTDIR)/monprof.o
@@ -193,6 +199,7 @@
 MONITOROBJS += $(OUTDIR)/reqevent.o
 MONITOROBJS += $(OUTDIR)/reqexit.o
 MONITOROBJS += $(OUTDIR)/reqget.o
+MONITOROBJS += $(OUTDIR)/reqinstanceid.o
 MONITOROBJS += $(OUTDIR)/reqkill.o
 MONITOROBJS += $(OUTDIR)/reqmonstats.o
 MONITOROBJS += $(OUTDIR)/reqmount.o
@@ -216,7 +223,6 @@
 MONITOROBJS += $(OUTDIR)/reqstartup.o
 MONITOROBJS += $(OUTDIR)/reqtmleader.o
 MONITOROBJS += $(OUTDIR)/reqtmready.o
-MONITOROBJS += $(OUTDIR)/reqtmsync.o
 MONITOROBJS += $(OUTDIR)/reqzoneinfo.o
 ifeq ($(USE_FORK_SUSPEND_RESUME),1)
     MONITOROBJS += $(OUTDIR)/monrs.o
@@ -336,13 +342,6 @@
 MONWDTOBJS = $(OUTDIR)/monwdt.o
 MONWDTOBJS += $(OUTDIR)/versmwdt.o
 
-ZOOMONOBJS = $(OUTDIR)/zootest.o
-ZOOMONOBJS += $(OUTDIR)/zclient.o
-ZOOMONOBJS += $(OUTDIR)/lock.o
-ZOOMONOBJS += $(OUTDIR)/montrace.o
-ZOOMONOBJS += $(OUTDIR)/CommonLogger.o
-ZOOMONOBJS += $(OUTDIR)/type2str.o
-
 MSG_LOG_OBJS = $(OUTDIR)/monlogging.o
 
 TRACE_LOG_OBJS = $(OUTDIR)/trace.o
@@ -374,7 +373,6 @@
 ALLOBJS += $(CONFOBJS)
 ALLOBJS += $(MEMLOGOBJS)
 ALLOBJS += $(RTSIGBLOCK_OBJS)
-ALLOBJS += $(ZOOMONOBJS)
 ALLOBJS += $(TYPE2STR_OBJS)
 
 SHAREDLIBS =   $(LIBEXPDIR)/libtrafconfig.so
@@ -387,40 +385,28 @@
 PGMS +=  $(BINEXPDIR)/pstartd
 PGMS +=  $(BINEXPDIR)/trafns
 
-TEST_PGMS = $(OUTDIR)/client
-TEST_PGMS += $(OUTDIR)/client2
-TEST_PGMS += $(OUTDIR)/montim
-TEST_PGMS += $(OUTDIR)/nsclient
-TEST_PGMS += $(OUTDIR)/server
-TEST_PGMS += $(OUTDIR)/nsserver
-TEST_PGMS += $(OUTDIR)/getseq
-TEST_PGMS += $(OUTDIR)/notify
-TEST_PGMS += $(OUTDIR)/testtm
-TEST_PGMS += $(OUTDIR)/pingpong2
-TEST_PGMS += $(OUTDIR)/testspx
-TEST_PGMS += $(OUTDIR)/testconf
-TEST_PGMS += $(OUTDIR)/zootest
+TEST_PGMS  = $(OUTDIR)/testconf
 
 # Compile rules for building sources
 
 $(BINEXPDIR)/%:%.cxx $(INCLUDE) $(CLIENTOBJS)
-	@echo 'Building target: $@'
+	@echo 'Building target0: $@'
 	@echo 'Invoking: C++ Compile & Linker'
-	@echo $(CXX) $(CDEPFLAGS) $(FLAGS) $(OPTIONS) $(INCLUDES) -o $@ -lrt  $(CLIENTOBJS) $(TRACE_LOG_OBJS) $<
-	@$(CXX) $(CDEPFLAGS) $(FLAGS) $(OPTIONS) $(INCLUDES) -o $@ -lrt $(CLIENTOBJS) $(TRACE_LOG_OBJS) $<
+	@echo $(CXX) $(CDEPFLAGS) $(FLAGS) $(OPTIONS) $(INCLUDES) -o $@ $(CLIENTOBJS) $(TRACE_LOG_OBJS) -lrt $(LIBS) $<
+	@$(CXX) $(CDEPFLAGS) $(FLAGS) $(OPTIONS) $(INCLUDES) -o $@ $(CLIENTOBJS) $(TRACE_LOG_OBJS) -lrt $(LIBS) $<
 	@echo 'Finished building target: $@'
 	@echo ' '
 
 $(OUTDIR)/%:%.cxx $(INCLUDE) $(CLIENTOBJS)
-	@echo 'Building target: $@'
+	@echo 'Building target1: $@'
 	@echo 'Invoking: C++ Compile & Linker'
-	@echo $(CXX) $(CDEPFLAGS) $(FLAGS) $(OPTIONS) $(INCLUDES) -o $@ -lrt  $(CLIENTOBJS) $(TRACE_LOG_OBJS) $<
-	@$(CXX) $(CDEPFLAGS) $(FLAGS) $(OPTIONS) $(INCLUDES) -o $@ -lrt $(LIBS) $(CLIENTOBJS) $(TRACE_LOG_OBJS) $<
+	@echo $(CXX) $(CDEPFLAGS) $(FLAGS) $(OPTIONS) $(INCLUDES) -o $@ $(CLIENTOBJS) $(TRACE_LOG_OBJS) -lrt $(LIBS)  $<
+	@$(CXX) $(CDEPFLAGS) $(FLAGS) $(OPTIONS) $(INCLUDES) -o $@ $(CLIENTOBJS) $(TRACE_LOG_OBJS) -lrt $(LIBS) $<
 	@echo 'Finished building target: $@'
 	@echo ' '
 
 $(OUTDIR)/%.o:%.cxx
-	@echo 'Building target: $@'
+	@echo 'Building target2: $@'
 	@echo 'Invoking: C++ Compiler'
 	@echo $(CXX) $(CDEPFLAGS) $(FLAGS) $(OPTIONS) $(INCLUDES) -c -o $@ $<
 	@if [ -d "$(OUTDIR)" ]; then x=1; else mkdir -p "$(OUTDIR)"; fi
@@ -429,7 +415,7 @@
 	@echo ' '
 
 $(OUTDIR)/%.o:%.cc
-	@echo 'Building target: $@'
+	@echo 'Building target3: $@'
 	@echo 'Invoking: C++ Compiler'
 	@echo $(CXX) $(CDEPFLAGS) $(FLAGS) $(OPTIONS) $(INCLUDES) -c -o $@ $<
 	@if [ -d "$(OUTDIR)" ]; then x=1; else mkdir -p "$(OUTDIR)"; fi
@@ -500,48 +486,40 @@
 $(OUTDIR)/testconf : $(CONFOBJS) $(TRACE_LOG_OBJS)
 	@echo 'Building target: $@'
 	@echo 'Invoking: Linker'
-	@echo $(CXX) $(CDEPFLAGS) $(FLAGS) $(OPTIONS) $(INCLUDES) -o $@ -lrt  $^
-	@$(CXX) $(CDEPFLAGS) $(FLAGS) $(OPTIONS) $(INCLUDES) -o $@ -lrt $(LIBS) $^
-	@echo 'Finished building target: $@'
-	@echo ' '
-
-$(OUTDIR)/zootest : $(ZOOMONOBJS) $(TRACE_LOG_OBJS) $(MSG_LOG_OBJS)
-	@echo 'Building target: $@'
-	@echo 'Invoking: C++ Compile & Linker'
-	@echo $(CXX) $(CDEPFLAGS) $(FLAGS) -pthread $(OPTIONS) $(INCLUDES) -o $@ $(LIBS) $^
-	@$(CXX) $(CDEPFLAGS) $(FLAGS) $(OPTIONS) -pthread $(INCLUDES) -o $@ $(LIBS) $^
+	@echo $(CXX) $(CDEPFLAGS) $(FLAGS) $(OPTIONS) $(INCLUDES) -o $@ $^ -lrt  $(LIBS)
+	@$(CXX) $(CDEPFLAGS) $(FLAGS) $(OPTIONS) $(INCLUDES) -o $@ $^ -lrt $(LIBS)
 	@echo 'Finished building target: $@'
 	@echo ' '
 
 $(BINEXPDIR)/shell : $(SHELLOBJS) $(TRACE_LOG_OBJS)
 	@echo 'Building target: $@'
 	@echo 'Invoking: Linker'
-	@echo $(CXX) -lreadline -lcurses $(LNK_FLGS) -o$@ -lrt $^
-	@$(CXX) -lreadline -lcurses $(LNK_FLGS) -o$@ -lrt $(LIBS) $^
+	@echo $(CXX) -o$@ $^ -lreadline -lcurses $(LNK_FLGS) -lrt  $(LIBS)
+	@$(CXX) -o$@ $^ -lreadline -lcurses $(LNK_FLGS) -lrt $(LIBS)
 	@echo 'Finished building target: $@'
 	@echo ' '
 
 $(BINEXPDIR)/sqwatchdog : $(WATCHDOGOBJS) $(CLIENTOBJS) $(TRACE_LOG_OBJS) $(MSG_LOG_OBJS)
 	@echo 'Building target: $@'
 	@echo 'Invoking: C++ Compile & Linker'
-	@echo $(CXX) $(CDEPFLAGS) $(FLAGS) -pthread $(OPTIONS) $(INCLUDES) -o $@ $(LIBS) $^
-	@$(CXX) $(CDEPFLAGS) $(FLAGS) $(OPTIONS) -pthread $(INCLUDES) -o $@ $(LIBS) $^
+	@echo $(CXX) $(CDEPFLAGS) $(FLAGS) $(OPTIONS) -pthread $(INCLUDES) $^ -o $@ $(LIBS)
+	@$(CXX) $(CDEPFLAGS) $(FLAGS) $(OPTIONS) -pthread $(INCLUDES) $^ -o $@ $(LIBS)
 	@echo 'Finished building target: $@'
 	@echo ' '
 
 $(BINEXPDIR)/pstartd: $(PSTARTDOBJS) $(OUTDIR)/monclio.o $(TRACE_LOG_OBJS)
 	@echo 'Building target: $@'
 	@echo 'Invoking: C++ Compile & Linker'
-	@echo $(CXX) $(CDEPFLAGS) $(FLAGS) -pthread $(OPTIONS) $(INCLUDES) -o $@ $(LIBS) $^
-	@$(CXX) $(CDEPFLAGS) $(FLAGS) $(OPTIONS) -pthread $(INCLUDES) -o $@ $(LIBS) $^
+	@echo $(CXX) $(CDEPFLAGS) $(FLAGS) $(OPTIONS) -pthread $(INCLUDES) $^ -o $@ $(LIBS)
+	@$(CXX) $(CDEPFLAGS) $(FLAGS) $(OPTIONS) -pthread $(INCLUDES) $^ -o $@ $(LIBS)
 	@echo 'Finished building target: $@'
 	@echo ' '
 
 $(BINEXPDIR)/monmemlog : $(MEMLOGOBJS) $(TRACE_LOG_OBJS)
 	@echo 'Building target: $@'
 	@echo 'Invoking: C++ Compile & Linker'
-	@echo $(CXX) $(FLAGS) -pthread $(OPTIONS) $(INCLUDES) -o $@ $(LIBS) $^
-	@$(CXX) $(FLAGS) $(OPTIONS) -pthread $(INCLUDES) -o $@ $(LIBS) $^
+	@echo $(CXX) $(FLAGS) $(OPTIONS) -pthread $(INCLUDES) $^ -o $@ $(LIBS)
+	@$(CXX) $(FLAGS) $(OPTIONS) -pthread $(INCLUDES) $^ -o $@ $(LIBS)
 	@echo 'Finished building target: $@'
 	@echo ' '
 
@@ -551,16 +529,16 @@
 $(BINEXPDIR)/monitor: $(MONITOROBJS)
 	@echo 'Building target: $@'
 	@echo 'Invoking: Linker'
-	@echo $(CXX) $(LNK_FLGS) -o$@ $(MONITOROBJS) $(LIBS) -lz -lcrypto
-	@$(CXX) $(LNK_FLGS) -o$@ $(MONITOROBJS) $(LIBS) -lz -lcrypto
+	@echo $(CXX) $(LNK_FLGS) -o$@ $(MONITOROBJS) $(LIBS) $(LIBDUMA) -lz -lcrypto
+	@$(CXX) $(LNK_FLGS) -o$@ $(MONITOROBJS) $(LIBS) $(LIBDUMA) -lz -lcrypto
 	@echo 'Finished building target: $@'
 	@echo ' '
 
 $(BINEXPDIR)/trafns: $(NSOBJS)
 	@echo 'Building target: $@'
 	@echo 'Invoking: Linker'
-	@echo $(CXX) $(LNK_FLGS) -o$@ $(NSOBJS) $(LIBS) -lz -lcrypto
-	@$(CXX) $(LNK_FLGS) -o$@ $(NSOBJS) $(LIBS) -lz -lcrypto
+	@echo $(CXX) $(LNK_FLGS) -o$@ $(NSOBJS) $(LIBS) $(LIBDUMA) -lz -lcrypto
+	@$(CXX) $(LNK_FLGS) -o$@ $(NSOBJS) $(LIBS) $(LIBDUMA) -lz -lcrypto
 	@echo 'Finished building target: $@'
 	@echo ' '
 
diff --git a/core/sqf/monitor/linux/mlio.cxx b/core/sqf/monitor/linux/mlio.cxx
index 0f16ea0..8a46088 100644
--- a/core/sqf/monitor/linux/mlio.cxx
+++ b/core/sqf/monitor/linux/mlio.cxx
@@ -91,6 +91,7 @@
    sizeof(REQTYPE) + sizeof( Event_Notice_def ),    // ReqType_Event
    sizeof(REQTYPE) + sizeof( Exit_def ),            // ReqType_Exit
    sizeof(REQTYPE) + sizeof( Get_def ),             // ReqType_Get
+   sizeof(REQTYPE) + sizeof( InstanceId_def ),      // ReqType_InstanceId
    sizeof(REQTYPE) + sizeof( Kill_def ),            // ReqType_Kill
    sizeof(REQTYPE) + sizeof( MonStats_def ),        // ReqType_MonStats
    sizeof(REQTYPE) + sizeof( Mount_def ),           // ReqType_Mount
@@ -106,23 +107,22 @@
    sizeof(REQTYPE) + sizeof( NodeInfo_def ),        // ReqType_NodeInfo
    sizeof(REQTYPE) + sizeof( NodeName_def ),        // ReqType_NodeName
    sizeof(REQTYPE) + sizeof( NodeUp_def ),          // ReqType_NodeUp
-   0,                              // ReqType_Notice -- not an actual request
+   0,                                               // ReqType_Notice -- not an actual request
    sizeof(REQTYPE) + sizeof( Notify_def ),          // ReqType_Notify
    sizeof(REQTYPE) + sizeof( Open_def ),            // ReqType_Open
    sizeof(REQTYPE) + sizeof( OpenInfo_def ),        // ReqType_OpenInfo
+   0,                                               // ReqType_PersistAdd (TODO)
+   0,                                               // ReqType_PersistDelete (TODO)
    sizeof(REQTYPE) + sizeof( PNodeInfo_def ),       // ReqType_PNodeInfo
    sizeof(REQTYPE) + sizeof( ProcessInfo_def ),     // ReqType_ProcessInfo
    sizeof(REQTYPE) + sizeof( ProcessInfoCont_def ), // ReqType_ProcessInfoCont
-   sizeof(REQTYPE) + sizeof( ProcessInfo_def ),     // ReqType_ProcessInfoPat
+   sizeof(REQTYPE) + sizeof( ProcessInfo_def ),     // ReqType_ProcessInfoNs
    sizeof(REQTYPE) + sizeof( Set_def ),             // ReqType_Set
    sizeof(REQTYPE) + sizeof( Shutdown_def ),        // ReqType_Shutdown
    sizeof(REQTYPE) + sizeof( ShutdownNs_def ),      // ReqType_ShutdownNs
    sizeof(REQTYPE) + sizeof( Startup_def ),         // ReqType_Startup
-   sizeof(REQTYPE) + sizeof( Stfsd_def ),           // ReqType_Stfsd
    sizeof(REQTYPE) + sizeof( TmLeader_def ),        // ReqType_TmLeader
    sizeof(REQTYPE) + sizeof( TmReady_def ),         // ReqType_TmReady
-   sizeof(REQTYPE) + sizeof( TmSync_def ),          // ReqType_TmSync
-   sizeof(REQTYPE) + sizeof( TransInfo_def ),       // ReqType_TransInfo
    sizeof(REQTYPE) + sizeof( ZoneInfo_def )         // ReqType_ZoneInfo
 };
 
@@ -134,6 +134,7 @@
    sizeof(REPLYTYPE) + sizeof( DelProcessNs_reply_def ),// ReplyType_DelProcessNs
    sizeof(REPLYTYPE) + sizeof( Dump_reply_def ),        // ReplyType_Dump
    sizeof(REPLYTYPE) + sizeof( Get_reply_def ),         // ReplyType_Get
+   sizeof(REPLYTYPE) + sizeof( InstanceId_reply_def ),  // ReplyType_InstanceId
    sizeof(REPLYTYPE) + sizeof( MonStats_reply_def ),    // ReplyType_MonStats
    sizeof(REPLYTYPE) + sizeof( Mount_reply_def ),       // ReplyType_Mount
    sizeof(REPLYTYPE) + sizeof( NewProcess_reply_def ),  // ReplyType_NewProcess
@@ -145,10 +146,7 @@
    sizeof(REPLYTYPE) + sizeof( PNodeInfo_reply_def ),   // ReplyType_PNodeInfo
    sizeof(REPLYTYPE) + sizeof( ProcessInfo_reply_def ), // ReplyType_ProcessInfo
    sizeof(REPLYTYPE) + sizeof( ProcessInfoNs_reply_def ),// ReplyType_ProcessInfoNs
-   sizeof(REPLYTYPE) + sizeof( Stfsd_reply_def ),       // ReplyType_Stfsd
    sizeof(REPLYTYPE) + sizeof( Startup_reply_def ),     // ReplyType_Startup
-   sizeof(REPLYTYPE) + sizeof( TmSync_reply_def ),      // ReplyType_TmSync 
-   sizeof(REPLYTYPE) + sizeof( TransInfo_reply_def ),   // ReplyType_TransInfo 
    sizeof(REPLYTYPE) + sizeof( ZoneInfo_reply_def )     // ReplyType_ZoneInfo
 };
 
@@ -166,20 +164,15 @@
    sizeof(REQTYPE) + sizeof( NodeDeleted_def ),       // MsgType_NodeDeleted
    sizeof(REQTYPE) + sizeof( NodeDown_def ),          // MsgType_NodeDown
    sizeof(REQTYPE) + sizeof( NodeJoining_def ),       // MsgType_NodeJoining
-   sizeof(REQTYPE) + sizeof( NodePrepare_def ),       // MsgType_NodePrepare
    sizeof(REQTYPE) + sizeof( NodeQuiesce_def ),       // MsgType_NodeQuiesce
    sizeof(REQTYPE) + sizeof( NodeUp_def ),            // MsgType_NodeUp
    sizeof(REQTYPE) + sizeof( Open_def ),              // MsgType_Open
    sizeof(REQTYPE) + sizeof( NewProcess_Notice_def ), // MsgType_ProcessCreated
    sizeof(REQTYPE) + sizeof( ProcessDeath_def ),      // MsgType_ProcessDeath
    sizeof(REQTYPE) + sizeof( NodeReInt_def ),         // MsgType_ReintegrationError
-   0, // MsgType_Service
+   0,                                                 // MsgType_Service
    sizeof(REQTYPE) + sizeof( Shutdown_def ),          // MsgType_Shutdown
-   sizeof(REQTYPE) + sizeof( SpareUp_def ),           // MsgType_SpareUp
-   sizeof(REQTYPE) + sizeof( TmRestarted_def ),       // MsgType_TmRestarted
-   sizeof(REQTYPE) + sizeof( TmSyncNotice_def ),      // MsgType_TmSyncAbort
-   sizeof(REQTYPE) + sizeof( TmSyncNotice_def ),      // MsgType_TmSyncCommit
-   sizeof(REQTYPE) + sizeof( UnsolicitedTmSync_def )  // MsgType_UnsolicitedMessage
+   sizeof(REQTYPE) + sizeof( SpareUp_def )            // MsgType_SpareUp
 
 };
 
@@ -964,7 +957,8 @@
         char la_buf[MON_STRING_BUF_SIZE];
         sprintf(la_buf, "[%s], Error= Can't set blocking signal mask for thread! - errno=%d (%s)\n", method_name, rc, strerror(rc));
         mon_log_write(MON_MLIO_SERIALREQ_TH_1, SQ_LOG_CRIT, la_buf);
-        abort();
+
+        mon_failure_exit();
     }
 
     // Record statistics (sonar counters): monitor is busy
@@ -1026,9 +1020,6 @@
     return (void *) errno; // cast
 }
 
-// The tmsync request thread is not used for concurrent requests.  Only the
-// serial request thread is used.
-
 // This is the pending notice thread used to send unsent notices to the client.
 void *
 pendingNoticeThread( void * )
@@ -1046,7 +1037,8 @@
       char la_buf[MON_STRING_BUF_SIZE];
       sprintf(la_buf, "[%s], Error= Can't set blocking signal mask for thread! - errno=%d (%s)\n", method_name, rc, strerror(rc));
       mon_log_write(MON_MLIO_PENDING_TH_1, SQ_LOG_CRIT, la_buf);
-      abort();
+
+      mon_failure_exit();
   }
 
   sigemptyset(&sig_set);
@@ -1216,7 +1208,8 @@
       sprintf(la_buf, "[%s], Monitor port does not contain ':' (%s)\n",
               method_name, MyCommPort);
       mon_log_write(MON_MLIO_INIT_2, SQ_LOG_ERR, la_buf);
-      abort();
+
+      mon_failure_exit();
   }
 
   myPortNum = strtoul(&pPort[1], NULL, 10);
@@ -1226,7 +1219,8 @@
       sprintf(la_buf, "[%s], Cannot convert MyCommPort - errno=%d (%s)\n",
               method_name, errno, strerror(errno));
       mon_log_write(MON_MLIO_INIT_3, SQ_LOG_ERR, la_buf);
-      abort();
+
+      mon_failure_exit();
   }
   sharedSegKeyBase = (key_t) ((nidBase << 28) + (myPortNum & 0xFFFF));
   if (myPortNum > 65535)
@@ -1234,7 +1228,8 @@
       char la_buf[MON_STRING_BUF_SIZE];
       sprintf(la_buf, "[%s], MyCommPort value exceeds 16 bits\n", method_name);
       mon_log_write(MON_MLIO_INIT_4, SQ_LOG_ERR, la_buf);
-      abort();
+
+      mon_failure_exit();
   }
   
   availableBufferCountMin = sharedBuffersMax;
@@ -1304,7 +1299,8 @@
                       char la_buf[MON_STRING_BUF_SIZE];
                       sprintf(la_buf, "[%s], Error= Can't access shared memory segment! - errno=%d (%s)\n", method_name, err, strerror(err));
                       mon_log_write(MON_MLIO_INIT_5, SQ_LOG_CRIT, la_buf);
-                      abort();
+
+                      mon_failure_exit();
                   }
               }
               else
@@ -1313,7 +1309,8 @@
                   char la_buf[MON_STRING_BUF_SIZE];
                   sprintf(la_buf, "[%s], Error= Can't remove existing shared memory segment! - errno=%d (%s)\n", method_name, err, strerror(err));
                   mon_log_write(MON_MLIO_INIT_6, SQ_LOG_CRIT, la_buf);
-                  abort();
+
+                  mon_failure_exit();
               }
           }
           else
@@ -1322,7 +1319,8 @@
               char la_buf[MON_STRING_BUF_SIZE];
               sprintf(la_buf, "[%s], Error= Can't access shared memory segment! - errno=%d (%s)\n", method_name, err, strerror(err));
               mon_log_write(MON_MLIO_INIT_7, SQ_LOG_CRIT, la_buf);
-              abort();
+
+              mon_failure_exit();
           }
       }
       else
@@ -1331,7 +1329,8 @@
           char la_buf[MON_STRING_BUF_SIZE];
           sprintf(la_buf, "[%s], Error= Can't access shared memory segment! - errno=%d (%s)\n", method_name, err, strerror(err));
           mon_log_write(MON_MLIO_INIT_8, SQ_LOG_CRIT, la_buf);
-          abort();
+
+          mon_failure_exit();
       }
   }
   
@@ -1345,7 +1344,8 @@
       char la_buf[MON_STRING_BUF_SIZE];
       sprintf(la_buf, "[%s], Error= Can't map shared memory segment address! - errno=%d (%s)\n", method_name, err, strerror(err));
       mon_log_write(MON_MLIO_INIT_9, SQ_LOG_CRIT, la_buf);
-      abort();
+
+      mon_failure_exit();
   }
 
   memset( clientBuffers, 0, shsize );
@@ -1370,7 +1370,8 @@
     char la_buf[MON_STRING_BUF_SIZE];
     sprintf(la_buf, "[%s], Error= Can't access message queue! - errno=%d (%s)\n", method_name, err, strerror(err));
     mon_log_write(MON_MLIO_INIT_10, SQ_LOG_CRIT, la_buf);
-    abort();
+
+    mon_failure_exit();
   }
 
   errno = 0;
@@ -1391,7 +1392,8 @@
       char la_buf[MON_STRING_BUF_SIZE];
       sprintf(la_buf, "[%s], Error= Can't drain message queue! - errno=%d (%s)\n", method_name, err, strerror(err));
       mon_log_write(MON_MLIO_INIT_11, SQ_LOG_CRIT, la_buf);
-      abort();
+
+      mon_failure_exit();
     }
   }
   // populate client buffer relative index location
@@ -1406,7 +1408,8 @@
       char la_buf[MON_STRING_BUF_SIZE];
       sprintf(la_buf, "[%s], Error= Can't load message queue! - errno=%d (%s)\n", method_name, err, strerror(err));
       mon_log_write(MON_MLIO_INIT_12, SQ_LOG_CRIT, la_buf);
-      abort();
+
+      mon_failure_exit();
     }
   }
 
@@ -1417,7 +1420,8 @@
     char la_buf[MON_STRING_BUF_SIZE];
     sprintf(la_buf, "[%s], Error= Can't lock shared memory segment! - errno=%d (%s)\n", method_name, err, strerror(err));
     mon_log_write(MON_MLIO_INIT_13, SQ_LOG_CRIT, la_buf);
-    abort();
+
+    mon_failure_exit();
   }
 
 
@@ -1465,9 +1469,6 @@
   if (trace_settings & TRACE_INIT)
       trace_printf("%s@%d" " pendingNoticeThread created, threadId=%lx" "\n", method_name, __LINE__, pendingNoticesTid_);
 
-// The tmsync threads is not used for concurrent requests.  Only the
-// serial request thread is used.
-
   // Create the local io buffer cleanup thread
   ret = pthread_create(&lioBufCleanupTid_, NULL, lioBufCleanupThread, NULL);
   if (ret != 0)
@@ -1546,6 +1547,7 @@
 struct message_def *
 SQ_LocalIOToClient::acquireMsg( int pid, Verifier_t verifier )
 {
+    bool done = false;
     struct message_def *msg = NULL;
     struct msqid_ds  mds;
     int ret;
@@ -1556,45 +1558,58 @@
 
     if (acquiredBufferCount < SQ_LIO_MONITOR_ACQUIRE_MAX)
     {
-        ret = (int)msgrcv( qid, 
-                           (void *)&cbi,
-                           sizeof(cbi.index),
-                           SQ_LIO_NORMAL_MSG,
-                           IPC_NOWAIT
-                         );
-        if (ret == sizeof(cbi.index))
+        while (!done)
         {
-            SharedMsgDef *shm;
-            shm = (SharedMsgDef *)(clientBuffers+sizeof(SharedMemHdr)
-                                               +(cbi.index*sizeof(SharedMsgDef)));
-            memset( &shm->trailer, 0, sizeof( shm->trailer ) );
-            shm->trailer.index = cbi.index;
-            shm->trailer.OSPid = pid;
-            shm->trailer.verifier = verifier;
-            shm->trailer.bufInUse = getpid();
-            clock_gettime(CLOCK_REALTIME, &shm->trailer.timestamp);
-            msg = &shm->msg;
-            // Increment acquiredBufferCount.  Use atomic operation due
-            // to multi-threaded access.
-            __sync_fetch_and_add( &acquiredBufferCount, 1 );
-
-            // Record statistics (sonar counters)
-            if (sonar_verify_state(SONAR_ENABLED | SONAR_MONITOR_ENABLED))
-               MonStats->LocalIOBuffersIncr();
-            if (acquiredBufferCount > acquiredBufferCountMax) {
+            ret = (int)msgrcv( qid
+                             , (void *)&cbi
+                             , sizeof(cbi.index)
+                             , SQ_LIO_NORMAL_MSG
+                             , IPC_NOWAIT );
+            if (ret == sizeof(cbi.index))
+            {
+                SharedMsgDef *shm;
+                shm = (SharedMsgDef *)(clientBuffers+sizeof(SharedMemHdr)
+                                                   +(cbi.index*sizeof(SharedMsgDef)));
+                memset( &shm->trailer, 0, sizeof( shm->trailer ) );
+                shm->trailer.index = cbi.index;
+                shm->trailer.OSPid = pid;
+                shm->trailer.verifier = verifier;
+                shm->trailer.bufInUse = getpid();
+                clock_gettime(CLOCK_REALTIME, &shm->trailer.timestamp);
+                msg = &shm->msg;
+                // Increment acquiredBufferCount.  Use atomic operation due
+                // to multi-threaded access.
+                __sync_fetch_and_add( &acquiredBufferCount, 1 );
+    
+                // Record statistics (sonar counters)
                 if (sonar_verify_state(SONAR_ENABLED | SONAR_MONITOR_ENABLED))
-                   MonStats->LocalIOBuffersMaxSet(acquiredBufferCount);
+                   MonStats->LocalIOBuffersIncr();
+                if (acquiredBufferCount > acquiredBufferCountMax) {
+                    if (sonar_verify_state(SONAR_ENABLED | SONAR_MONITOR_ENABLED))
+                       MonStats->LocalIOBuffersMaxSet(acquiredBufferCount);
+                }
+    
+                acquiredBufferCountMax = acquiredBufferCount > acquiredBufferCountMax ? acquiredBufferCount : acquiredBufferCountMax;
+                if (trace_settings & TRACE_MLIO_DETAIL)
+                  trace_printf("%s@%d" " dequeued shared buffer, idx="  "%d" "\n", method_name, __LINE__, cbi.index);
+                done = true;
             }
-
-            acquiredBufferCountMax = acquiredBufferCount > acquiredBufferCountMax ? acquiredBufferCount : acquiredBufferCountMax;
-            if (trace_settings & TRACE_MLIO_DETAIL)
-              trace_printf("%s@%d" " dequeued shared buffer, idx="  "%d" "\n", method_name, __LINE__, cbi.index);
-        }
-        else
-        {  // msgrcv error
-            char la_buf[MON_STRING_BUF_SIZE];
-            sprintf(la_buf, "[%s], msgrcv error %s (%d)\n", method_name, strerror(errno), errno);
-            mon_log_write(MON_MLIO_ACQUIRE_MSG_1, SQ_LOG_ERR, la_buf);
+            else if (ret == -1  && errno != ENOMSG)
+            {  // msgrcv error
+                char la_buf[MON_STRING_BUF_SIZE];
+                sprintf(la_buf, "[%s], msgrcv error %s (%d)\n", method_name, strerror(errno), errno);
+                mon_log_write(MON_MLIO_ACQUIRE_MSG_1, SQ_LOG_ERR, la_buf);
+                done = true;
+            }
+            if (!done)
+            {
+                usleep(10000); // sleep 10ms and try again
+                if (trace_settings & TRACE_MLIO)
+                {
+                    trace_printf( "%s@%d" " No message buffer!\n"
+                                , method_name, __LINE__);
+                }
+            }
         }
     }
     else
diff --git a/core/sqf/monitor/linux/monitor.cxx b/core/sqf/monitor/linux/monitor.cxx
old mode 100755
new mode 100644
index 6638c4f..4ed4db6
--- a/core/sqf/monitor/linux/monitor.cxx
+++ b/core/sqf/monitor/linux/monitor.cxx
@@ -51,7 +51,6 @@
 #include "nameserverconfig.h"
 #include "lnode.h"
 #include "pnode.h"
-#include "tmsync.h"
 #include "cluster.h"
 #include "monitor.h"
 #include "props.h"
@@ -128,6 +127,7 @@
 bool IsRealCluster = true;
 bool IsAgentMode = false;
 bool IsNameServer = false;
+AgentType_t AgentType = AgentType_Undefined;
 bool IsMaster = false;
 bool IsMPIChild = false;
 char MasterMonitorName[MAX_PROCESS_PATH]= {'\0'};
@@ -140,6 +140,8 @@
 #ifndef NAMESERVER_PROCESS
 bool NameServerEnabled = false;
 #endif
+int  ClusterId = -1;
+int  InstanceId = -1;
 
 // Lock to manage memory modifications during fork/exec
 CLock MemModLock;
@@ -254,6 +256,32 @@
 }
 
 #ifndef NAMESERVER_PROCESS
+void sigtermSignalHandler(int signal, siginfo_t *info, void *)
+{
+    const char method_name[] = "sigtermSignalHandler";
+    // Logs a critical event naming this node, then calls Monitor->HardNodeDown() for MyPNID.
+    if (trace_settings & TRACE_ENTRY_EXIT)
+       trace_nolock_printf("%s@%d\n", method_name, __LINE__);
+
+    if (trace_settings & TRACE_SIG_HANDLER)
+        trace_nolock_printf("%s@%d - signal=%d, code=%d, status=%d, pid=%d\n",
+                            method_name, __LINE__, signal, info->si_code,
+                            info->si_status, info->si_pid);
+
+    char la_buf[MON_STRING_BUF_SIZE*2];
+    snprintf( la_buf, sizeof(la_buf)
+            , "[%s], Initiating node down on Node %s, pnid=%d (received SIGTERM signal)\n"
+            , method_name
+            , MyNode->GetName()
+            , MyPNID );
+    mon_log_write(MON_MONITOR_SIGTERMSIGNALHANDLER_1, SQ_LOG_CRIT, la_buf); 
+    // NOTE(review): snprintf() is not on the POSIX async-signal-safe list; if this runs in real signal context, confirm the logging and HardNodeDown() calls are safe there.
+    Monitor->HardNodeDown( MyPNID, true );
+
+    if (trace_settings & TRACE_ENTRY_EXIT)
+        trace_nolock_printf("%s@%d - Exit\n", method_name, __LINE__);
+}
+
 void child_death_signal_handler2 (int signal, siginfo_t *info, void *)
 {
     pid_t pid;
@@ -350,6 +378,30 @@
     malloc_stats();
 }
 
+const char *AgentTypeString( AgentType_t agentType)
+{
+    const char *str;
+    // Returns a string literal naming agentType; values without a case below yield "Undefined".
+    switch( agentType )
+    {
+        case AgentType_Ambari:
+            str = "Ambari";
+            break;
+        case AgentType_CM:
+            str = "CM";
+            break;
+        case AgentType_MPI:
+            str = "MPI";
+            break;
+        default:
+            str = "Undefined";
+            break;
+    }
+
+    return( str );
+}
+
+
 const char *CommTypeString( CommType_t commType)
 {
     const char *str;
@@ -376,7 +428,7 @@
     : CCluster (),
 #else
 CMonitor::CMonitor (int procTermSig)
-    : CTmSync_Container (),
+    : CCluster (),
 #endif
       OpenCount (0)
     , NoticeCount (0)
@@ -457,9 +509,16 @@
         PidMap = true;
     }
 
-    snprintf( fname, sizeof(fname), "%s/monitor.map.%d.%s",
-             getenv("MPI_TMPDIR"), MyPNID, Node_name );
-    remove(fname);
+    if ( IsRealCluster )
+    {
+        snprintf( fname, sizeof(fname), "%s/monitor.map.%s",
+                 getenv("TRAF_LOG"), Node_name );
+    }
+    else
+    {
+        snprintf( fname, sizeof(fname), "%s/monitor.map.%d.%s",
+                 getenv("TRAF_LOG"), MyPNID, Node_name );
+    }
     processMapFd = open(fname, O_WRONLY | O_APPEND | O_CREAT,
                         S_IRUSR | S_IWUSR );
     if ( processMapFd == -1 )
@@ -1074,29 +1133,143 @@
 }
 #endif
 
-void HandleAssignMonitorLeader ( const char *failedMaster )
-{
-    const char method_name[] = "HandleAssignMonitorLeader";
-    TRACE_ENTRY;
-    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
-    {
-        trace_printf("%s@%d HandleAssignMonitorLeader called for %s\n"
-                            , method_name, __LINE__, failedMaster);
-    }
-    // only relevant in AgentMode
-     if (IsAgentMode)
-     {
-         Monitor->AssignMonitorLeader(failedMaster);
-     }
-    
-    TRACE_EXIT;
-}
-
 void HandleMyNodeExpiration( void )
 {
     const char method_name[] = "HandleMyNodeExpiration";
     TRACE_ENTRY;
-    ReqQueue.enqueueDownReq(MyPNID);
+    if (ZClientEnabled )
+    {
+        if (ZClient->StateGet() == CZClient::ZC_SHUTDOWN)
+        {
+            return;
+        }
+
+        if (!MyNode->IsPendingNodeDown())
+        {
+            ReqQueue.enqueueDownReq(MyPNID);
+        }
+    }
+    TRACE_EXIT;
+}
+
+void HandleNodeChange( const char *nodeName )
+{
+    const char method_name[] = "HandleNodeChange";
+    TRACE_ENTRY;
+
+    if (ZClientEnabled )
+    {
+        if (ZClient->StateGet() == CZClient::ZC_SHUTDOWN)
+        {
+            return;
+        }
+
+        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+        {
+            trace_printf( "%s@%d Handling CHANGE for node=%s\n"
+                        , method_name, __LINE__
+                        , nodeName );
+        }
+    }
+
+    TRACE_EXIT;
+}
+
+void HandleNodeConfigurationChange( void )
+{
+    const char method_name[] = "HandleNodeConfigurationChange";
+    TRACE_ENTRY;
+
+    if (ZClientEnabled )
+    {
+        if (ZClient->StateGet() == CZClient::ZC_SHUTDOWN)
+        {
+            return;
+        }
+
+        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+        {
+            trace_printf( "%s@%d Handling CONFIGURATION CHANGE\n"
+                        , method_name, __LINE__ );
+        }
+    }
+
+    TRACE_EXIT;
+}
+
+void HandleNodeError( const char *nodeName )
+{
+    const char method_name[] = "HandleNodeError";
+    TRACE_ENTRY;
+
+    if (ZClientEnabled )
+    {
+        if (ZClient->StateGet() == CZClient::ZC_SHUTDOWN)
+        {
+            return;
+        }
+
+        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+        {
+            trace_printf( "%s@%d Handling ERROR for node=%s\n"
+                        , method_name, __LINE__
+                        , nodeName );
+        }
+
+        CNode *node = Nodes->GetNode((char *)nodeName);
+        if (node)
+        {
+            if (node->GetState() != State_Down)
+            {
+                node->SetPendingNodeDown(true);
+            }
+            ReqQueue.enqueueDownReq(node->GetPNid());
+        }
+    }
+    TRACE_EXIT;
+}
+
+void HandleNodeChild( const char *nodeName )
+{
+    const char method_name[] = "HandleNodeChild";
+    TRACE_ENTRY;
+
+    if (ZClientEnabled )
+    {
+        if (ZClient->StateGet() == CZClient::ZC_SHUTDOWN)
+        {
+            return;
+        }
+
+        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+        {
+            trace_printf( "%s@%d Handling CHILD for node=%s\n"
+                        , method_name, __LINE__
+                        , nodeName );
+        }
+    }
+    TRACE_EXIT;
+}
+
+void HandleNodeCreated( const char *nodeName )
+{
+    const char method_name[] = "HandleNodeCreated";
+    TRACE_ENTRY;
+
+    if (ZClientEnabled )
+    {
+        if (ZClient->StateGet() == CZClient::ZC_SHUTDOWN)
+        {
+            return;
+        }
+
+        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+        {
+            trace_printf( "%s@%d Handling CREATE for node=%s\n"
+                        , method_name, __LINE__
+                        , nodeName );
+        }
+    }
     TRACE_EXIT;
 }
 
@@ -1104,11 +1277,32 @@
 {
     const char method_name[] = "HandleNodeExpiration";
     TRACE_ENTRY;
-    CNode *node = Nodes->GetNode((char *)nodeName);
-    if (node)
+
+    if (ZClientEnabled )
     {
-        ReqQueue.enqueueDownReq(node->GetPNid());
+        if (ZClient->StateGet() == CZClient::ZC_SHUTDOWN)
+        {
+            return;
+        }
+
+        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+        {
+            trace_printf( "%s@%d Handling EXPIRATION of node=%s\n"
+                        , method_name, __LINE__
+                        , nodeName );
+        }
+
+        CNode *node = Nodes->GetNode((char *)nodeName);
+        if (node)
+        {
+            if (node->GetState() != State_Down)
+            {
+                node->SetPendingNodeDown(true);
+            }
+            ReqQueue.enqueueDownReq(node->GetPNid());
+        }
     }
+
     TRACE_EXIT;
 }
 
@@ -1120,6 +1314,8 @@
     if ( ZClientEnabled )
     {
         string       hostName;
+        string       instanceId;
+        string       trafodionRootZNode;
         string       zkQuorumHosts;
         stringstream zkQuorumPort;
         char *env;
@@ -1191,17 +1387,52 @@
             }
         }
 
+        env = getenv("TRAF_ROOT_ZNODE");
+        if ( env )
+        {
+            stringstream ss;
+            ss.str( "" );
+            ss << env;
+            trafodionRootZNode = ss.str();
+        }
+        else
+        {
+            char la_buf[MON_STRING_BUF_SIZE];
+            sprintf(la_buf, "[%s], Environment variable TRAF_ROOT_ZNODE is undefined, exiting!\n", method_name);
+            mon_log_write(MON_MONITOR_CREATEZCLIENT_3, SQ_LOG_CRIT, la_buf);
+ 
+            printf( "%s", la_buf);
+            exit(EXIT_FAILURE);
+        }
+
+        if (InstanceId != -1)
+        {
+            stringstream ss;
+            ss.str( "" );
+            ss << "/" << InstanceId;
+            instanceId = ss.str();
+        }
+        else
+        {
+            char la_buf[MON_STRING_BUF_SIZE];
+            sprintf(la_buf, "[%s], Instance Id is undefined, exiting!\n", method_name);
+            mon_log_write(MON_MONITOR_CREATEZCLIENT_4, SQ_LOG_CRIT, la_buf);
+ 
+            printf( "%s", la_buf);
+            exit(EXIT_FAILURE);
+        }
+
         ZClient = new CZClient( zkQuorumPort.str().c_str()
-                              , ZCLIENT_TRAFODION_ZNODE
-                              , ZCLIENT_INSTANCE_ZNODE );
+                              , trafodionRootZNode.c_str()
+                              , instanceId.c_str() );
         if ( ZClient == NULL )
         {
             char buf[MON_STRING_BUF_SIZE];
             snprintf(buf, sizeof(buf),
                      "[%s], Failed to allocate ZClient object!\n"
                     , method_name);
-            mon_log_write(MON_MONITOR_CREATEZCLIENT_3, SQ_LOG_CRIT, buf);
-            abort();
+            mon_log_write(MON_MONITOR_CREATEZCLIENT_5, SQ_LOG_CRIT, buf);
+            mon_failure_exit();
         }
     }
 
@@ -1281,12 +1512,14 @@
 {
     int i;
     int rc;
+    int configMasterPNid = -1;
     bool done = false;
     char *env;
     char *nodename = NULL;
     char fname[MAX_PROCESS_PATH];
     char short_node_name[MPI_MAX_PROCESSOR_NAME];
 #ifndef NAMESERVER_PROCESS
+    int portFileOpenDelay = DEFAULT_PORTFILEOPEN_DELAY;
     char port_fname[MAX_PROCESS_PATH];
     char temp_fname[MAX_PROCESS_PATH];
 #endif
@@ -1311,7 +1544,7 @@
 
     if (argc < 2) {
       printf("error: monitor needs an argument...exitting...\n");
-      exit(0);
+      exit(EXIT_FAILURE);
     }
 
     int lv_arg_index = 1;
@@ -1404,14 +1637,6 @@
         }
     }
 
-#ifdef NAMESERVER_PROCESS
-    MonLog = new CMonLog( "log4cxx.monitor.trafns.config", "NS", "alt.mon", -1, -1, getpid(), "$TNS" );
-#else
-    MonLog = new CMonLog( "log4cxx.monitor.mon.config", "MON", "alt.mon", -1, -1, getpid(), "$MONITOR" );
-#endif
-
-    MonLog->setupInMemoryLog();
-
 #ifdef MULTI_TRACE_FILES
     initVariableKey();
 #endif
@@ -1441,14 +1666,14 @@
     {
         Measure = 1;
         snprintf(fname, sizeof(fname), "%s/monitor.P%d",
-                 getenv("MPI_TMPDIR"),getpid());
+                 getenv("TRAF_LOG"),getpid());
         setenv("MPI_INSTR", fname, 1);
     }
     if ( env && *env == '2' )
     {
         Measure = 2;
         snprintf(fname, sizeof(fname), "%s/monitor.cpu.P%d:cpu",
-                 getenv("MPI_TMPDIR"),getpid());
+                 getenv("TRAF_LOG"),getpid());
         setenv("MPI_INSTR", fname, 1);
     }
 
@@ -1462,18 +1687,68 @@
         CommType = CommType_Sockets;
     }
 
+    env = getenv("TRAF_AGENT");
+    if ( env != NULL && strcmp(env, "CM") == 0 )
+    {
+        AgentType = AgentType_CM;
+    }
+    else if ( env != NULL && strcmp(env, "Ambari") == 0 )
+    {
+        AgentType = AgentType_Ambari;
+    }
+    else
+    {
+        AgentType = AgentType_MPI;
+    }
+
+#ifdef NAMESERVER_PROCESS
+    env = getenv("NS_GENERATE_CORE_ON_FAILURE_EXIT");
+#else
+    env = getenv("MON_GENERATE_CORE_ON_FAILURE_EXIT");
+#endif
+    if ( env )
+    {
+        if ( *env == '0' )
+        {
+            GenCoreOnFailureExit = false;
+        }
+        else
+        {
+            GenCoreOnFailureExit = true;
+        }
+    }
+
+#ifndef NAMESERVER_PROCESS
+    if (AgentType == AgentType_CM)
+    {
+        // Set sigaction such that SIGTERM signal is caught.
+        // In a Cloudera Manager (CM) environment, the supervisord
+        // will send the monitor (NODE ROLE) a SIGTERM to indicate
+        // that it is time to shut down the instance
+        struct sigaction act;
+        act.sa_sigaction = sigtermSignalHandler;
+        act.sa_flags = SA_SIGINFO | SA_NODEFER;
+        sigemptyset (&act.sa_mask);
+        sigaddset (&act.sa_mask, SIGTERM);
+        sigaction (SIGTERM, &act, NULL);
+    }
+#endif
+
     // Mask all allowed signals
     sigset_t              mask;
     sigfillset(&mask);
     sigdelset(&mask, SIGPROF); // allows profiling such as google profiler
     sigdelset(&mask, SIGUSR2);
+#ifndef NAMESERVER_PROCESS
+    sigdelset(&mask, SIGTERM);
+#endif
     rc = pthread_sigmask(SIG_SETMASK, &mask, NULL);
     if (rc != 0)
     {
-        char buf[MON_STRING_BUF_SIZE];
-        snprintf(buf, sizeof(buf), "[%s], pthread_sigmask error=%d\n",
-                 method_name, rc);
-        mon_log_write(MON_MONITOR_MAIN_1, SQ_LOG_ERR, buf);
+        printf( "[%s],  Unable to set signal mask, "
+                "pthread_sigmask() error=%d (%s)\n"
+               ,method_name, rc, strerror(rc) );
+        exit(EXIT_FAILURE);
     }
 
     // Setup HP_MPI software license
@@ -1535,6 +1810,20 @@
     }
 
 #ifndef NAMESERVER_PROCESS
+    env = getenv("MON_PORT_OPEN_FILE_DELAY");
+    if ( env && isdigit(*env) )
+    {
+        portFileOpenDelay = atoi(env);
+        if (portFileOpenDelay < MIN_PORTFILEOPEN_DELAY)
+        {
+            portFileOpenDelay = MIN_PORTFILEOPEN_DELAY;
+        }
+        else if (portFileOpenDelay > MAX_PORTFILEOPEN_DELAY)
+        {
+            portFileOpenDelay = MAX_PORTFILEOPEN_DELAY;
+        }
+    }
+
     // We need to delay some to make sure all monitor processes have initialized before
     // any monitor tries to perform an Allgather operation.
     sleep( initSleepTime );
@@ -1546,8 +1835,6 @@
     MPI_Comm_rank (MPI_COMM_WORLD, &MyPNID);
 #endif
 
-    MonLog->setPNid( MyPNID );
-
     gethostname(Node_name, MPI_MAX_PROCESSOR_NAME);
     char *tmpptr = Node_name;
     while ( *tmpptr )
@@ -1582,6 +1869,14 @@
 #endif
 
 #ifdef NAMESERVER_PROCESS
+    MonLog = new CMonLog( "log4cxx.monitor.trafns.config", "NS", "alt.mon", MyPNID, -1, getpid(), "$TNS" );
+#else
+    MonLog = new CMonLog( "log4cxx.monitor.mon.config", "MON", "alt.mon", MyPNID, -1, getpid(), "$MONITOR" );
+#endif
+    MonLog->setPNid( MyPNID );
+    MonLog->setupInMemoryLog();
+
+#ifdef NAMESERVER_PROCESS
     // Without mpi daemon the monitor has no default standard output.
     // We create a standard output file here.
     if ( IsRealCluster )
@@ -1608,8 +1903,7 @@
                  getenv("TRAF_LOG"), MyPNID, Node_name);
     }
 #endif
-    remove(fname);
-    if( freopen (fname, "w", stdout) == NULL )
+    if( freopen(fname, "a", stdout) == NULL )
     {
         char buf[MON_STRING_BUF_SIZE];
         snprintf(buf, sizeof(buf), "[%s], can't open stdout (%s).\n",
@@ -1640,7 +1934,7 @@
         default:
             printf( "SQ_IC contains invalid communication protocol: %s\n"
                    , CommTypeString(CommType));
-            abort();
+            exit(EXIT_FAILURE);
     }
 
     if ((!IsAgentMode) && (argc > 3 && strcmp (argv[2], "-integrate") == 0))
@@ -1656,7 +1950,7 @@
                 else
                 {
                     printf ( "Invalid integrating monitor MPI port: %s\n", argv[3]);
-                    abort();
+                    exit(EXIT_FAILURE);
                 }
                 break;
             case CommType_Sockets:
@@ -1665,25 +1959,13 @@
                     // In agent mode and when re-integrating (node up), all
                     // monitors processes start as a cluster of 1 and join to the
                     // creator monitor to establish the real cluster.
-                    // Therefore, MyPNID will always be zero then it is
-                    // necessary to use the node name to obtain the correct
-                    // <pnid> from the configuration which occurs when creating the
-                    // CMonitor object down below. By setting MyPNID to -1, when the
-                    // CCluster::InitializeConfigCluster() invoked during the creation
-                    // of the CMonitor object it will set MyPNID using Node_name.
-#ifdef NAMESERVER_PROCESS
-                    if ( IsRealCluster )
-                        MyPNID = -1;
-#else
-                    MyPNID = -1;
-#endif
                     SMSIntegrating = IAmIntegrating = true;
                     strcpy( IntegratingMonitorPort, argv[3] );
                 }
                 else
                 {
                     printf ( "Invalid integrating monitor socket port: %s\n", argv[3]);
-                    abort();
+                    exit(EXIT_FAILURE);
                 }
                 break;
             default:
@@ -1698,7 +1980,7 @@
         else
         {
             printf ( "Invalid creator shell pid: %s\n", argv[4]);
-            abort();
+            exit(EXIT_FAILURE);
         }
         if ( isdigit (*argv[5]) )
         {
@@ -1708,7 +1990,7 @@
         else
         {
             printf ( "Invalid creator shell verifier: %s\n", argv[5]);
-            abort();
+            exit(EXIT_FAILURE);
         }
 
         // Trace cannot be specified on startup command but need to
@@ -1717,12 +1999,18 @@
 
     }
 
+    if ( IsRealCluster )
+    {
+        // Set MyPNID to -1 so that the node name is used to obtain the
+        // correct <pnid> from the configuration, which occurs when the
+        // CMonitor object is created down below. With MyPNID set to -1,
+        // CCluster::InitializeConfigCluster(), invoked during the creation
+        // of the CMonitor object, will set MyPNID using Node_name.
+        MyPNID = -1;
+    }
+
     if (IsAgentMode)
     {
-        if ( IsRealCluster )
-        {
-            MyPNID = -1;
-        }
         CreatorShellPid = 1000; // per monitor.sh
         CreatorShellVerifier = 0;
     }
@@ -1817,22 +2105,28 @@
     if (sonar_verify_state(SONAR_ENABLED | SONAR_MONITOR_ENABLED))
        MonStats->MonitorBusyIncr();
 
+    const char message_tag[] = "Trafodion";
+    snprintf( buf, sizeof(buf), "[%s] - monitor Started!\n", message_tag );
+    mon_log_write(MON_MONITOR_MAIN_3, SQ_LOG_INFO, buf);
 #ifdef NAMESERVER_PROCESS
-    snprintf(buf, sizeof(buf),
-                 "[CMonitor::main], %s, Started! CommType: %s\n"
+    snprintf(buf, sizeof(buf), "[%s] - %s, Started! CommType: %s\n"
+                , message_tag
                 , CALL_COMP_GETVERS2(trafns), CommTypeString( CommType ));
 #else
-    snprintf(buf, sizeof(buf),
-                 "[CMonitor::main], %s, Started! CommType: %s (%s%s%s%s)\n"
+    snprintf(buf, sizeof(buf), "[%s] - %s, Started! CommType: %s (%s%s%s%s%s)\n"
+                , message_tag
                 , CALL_COMP_GETVERS2(monitor)
                 , CommTypeString( CommType )
                 , IsRealCluster?"RealCluster":"VirtualCluster"
-                , IsAgentMode?"/AgentMode":""
+                , IsAgentMode?"/AgentMode:":""
+                , IsAgentMode?AgentTypeString( AgentType ):""
                 , IsMPIChild?"/MPIChild":""
                 , NameServerEnabled?"/NameServerEnabled":"" );
 #endif
     mon_log_write(MON_MONITOR_MAIN_3, SQ_LOG_INFO, buf);
-
+    snprintf( buf, sizeof(buf), "[%s] - monitor Started!\n", message_tag );
+    mon_log_write(MON_MONITOR_MAIN_3, SQ_LOG_INFO, buf);
+       
 #ifdef DMALLOC
     if (trace_settings & TRACE_INIT)
        trace_printf("%s@%d" "DMALLOC Option set" "\n", method_name, __LINE__);
@@ -1867,7 +2161,7 @@
                      sprintf(la_buf, "[%s], Failed to load cluster configuration.\n", method_name);
                      mon_log_write(MON_MONITOR_MAIN_12, SQ_LOG_CRIT, la_buf);
 
-                     abort();
+                     mon_failure_exit();
                 }
             }
             else
@@ -1876,7 +2170,7 @@
                 sprintf(la_buf, "[%s], Failed to open cluster configuration.\n", method_name);
                 mon_log_write(MON_MONITOR_MAIN_13, SQ_LOG_CRIT, la_buf);
 
-                abort();
+                mon_failure_exit();
             }
         }
        else
@@ -1885,7 +2179,7 @@
            sprintf(la_buf, "[%s], Failed to allocate cluster configuration.\n", method_name);
            mon_log_write(MON_MONITOR_MAIN_14, SQ_LOG_CRIT, la_buf);
 
-           abort();
+           mon_failure_exit();
         }
 
         //Moved creation of the below to later on
@@ -1893,6 +2187,8 @@
         //Config = new CConfigContainer ();
         //Monitor = new CMonitor (procTermSig);
         
+        ClusterId = ClusterConfig->GetClusterId();
+        InstanceId = ClusterConfig->GetInstanceId();
 
         // Set up zookeeper and determine the master
         if ( IsAgentMode || IsRealCluster )
@@ -1923,10 +2219,11 @@
 
         if (IsAgentMode)
         {
+            configMasterPNid = ClusterConfig->GetConfigMaster();
             if ((ZClientEnabled) && (ZClient != NULL))
             {
                 // Do not wait, just see if one exists
-                const char *masterMonitor = ZClient->WaitForAndReturnMaster(false);
+                const char *masterMonitor = ZClient->MasterWaitForAndReturn(false);
 
                 if (masterMonitor)
                 {
@@ -1934,9 +2231,11 @@
 
                     if (trace_settings & TRACE_INIT)
                     {
-                        trace_printf("%s@%d (MasterMonitor) IsAgentMode = TRUE, masterMonitor from ZK: %s, Node_name: %s\n"
+                        trace_printf("%s@%d (MasterMonitor) IsAgentMode=%s, IAmIntegrating=%s, masterMonitor from ZK: %s, Node_name: %s\n"
                                      , method_name
                                      , __LINE__
+                                     , IsAgentMode?"TRUE":"FALSE"
+                                     , IAmIntegrating?"TRUE":"FALSE"
                                      , MasterMonitorName
                                      , Node_name);
                     }
@@ -1958,9 +2257,11 @@
 
                     if (trace_settings & TRACE_INIT)
                     {
-                        trace_printf("%s@%d (MasterMonitor) IsAgentMode = TRUE, ConfigMasterMonitor: %s, Node_name:%s \n"
+                        trace_printf("%s@%d (MasterMonitor) IsAgentMode=%s, IAmIntegrating=%s, ConfigMasterMonitor: %s, Node_name:%s \n"
                                      , method_name
                                      , __LINE__
+                                     , IsAgentMode?"TRUE":"FALSE"
+                                     , IAmIntegrating?"TRUE":"FALSE"
                                      , MasterMonitorName
                                      , Node_name);
                     }
@@ -1982,9 +2283,11 @@
 
                 if (trace_settings & TRACE_INIT)
                 {
-                    trace_printf("%s@%d (MasterMonitor) IsAgentMode = TRUE, ConfigMasterMonitor: %s, Node_name:%s \n"
+                    trace_printf("%s@%d (MasterMonitor) IsAgentMode=%s, IAmIntegrating=%s, ConfigMasterMonitor: %s, Node_name:%s \n"
                                  , method_name
                                  , __LINE__
+                                 , IsAgentMode?"TRUE":"FALSE"
+                                 , IAmIntegrating?"TRUE":"FALSE"
                                  , MasterMonitorName
                                  , Node_name);
                 }
@@ -2012,14 +2315,6 @@
          {
             if (!IsMaster)
             {
-#ifdef NAMESERVER_PROCESS
-                if ( IsRealCluster )
-                {
-                    MyPNID = -1;
-                }
-#else
-                MyPNID = -1;
-#endif
                 SMSIntegrating = IAmIntegrating = true;
 #ifdef NAMESERVER_PROCESS
                 char *monitorPort = getenv ("NS_COMM_PORT");
@@ -2047,20 +2342,22 @@
                 }
                 if (trace_settings & TRACE_INIT)
                 {
-                    trace_printf( "%s@%d (MasterMonitor) IsAgentMode = TRUE, I am NOT the master, "
-                                  "MyPNID=%d, master port=%s\n"
+                    trace_printf( "%s@%d (MasterMonitor) IsAgentMode=%s, IAmIntegrating=%s, "
+                                  "I am NOT the master, MyPNID=%d, master port=%s\n"
                                 , method_name, __LINE__
+                                , IsAgentMode?"TRUE":"FALSE"
+                                , IAmIntegrating?"TRUE":"FALSE"
                                 , MyPNID, IntegratingMonitorPort );
                 }
             }
             else
             {
+                IAmIntegrating = false;
                 if (trace_settings & TRACE_INIT)
                 {
-                    trace_printf( "%s@%d (MasterMonitor) IsAgentMode = TRUE, I am the master, MyPNID=%d\n"
-                                , method_name, __LINE__, MyPNID );
+                    trace_printf( "%s@%d (MasterMonitor) IsAgentMode=%s, IAmIntegrating=%s, I AM the master, MyPNID=%d\n"
+                                , method_name, __LINE__, IsAgentMode?"TRUE":"FALSE", IAmIntegrating?"TRUE":"FALSE", MyPNID );
                 }
-                IAmIntegrating = false;
             }
         }
 
@@ -2101,78 +2398,121 @@
             {
                 char la_buf[MON_STRING_BUF_SIZE];
                 sprintf( la_buf
-                       , "[%s], Failed to get my Node, MyPNID=%d\n"
+                       , "[%s], Failed to get myNode object, MyPNID=%d\n"
                        , method_name, MyPNID );
                 mon_log_write(MON_MONITOR_MAIN_15, SQ_LOG_CRIT, la_buf);
 
-                abort();
+                mon_failure_exit();
             }
-
             if ((ZClientEnabled) && (ZClient != NULL))
             {
+                bool newMasterSelected = false;
+retryMaster:
                 CNode *masterNode = Nodes->GetNode(MasterMonitorName);
                 if (!masterNode)
                 {
                     if (trace_settings & TRACE_INIT)
                     {
-                          trace_printf("%s@%d (MasterMonitor) IsMaster=%d, masterNode is NULL, with MasterMonitorName %s\n", method_name, __LINE__, IsMaster, MasterMonitorName);
+                        trace_printf("%s@%d (MasterMonitor) IsMaster=%d, IAmIntegrating=%s, masterNode is NULL, with MasterMonitorName %s\n"
+                                    , method_name, __LINE__, IsMaster, IAmIntegrating?"TRUE":"FALSE", MasterMonitorName);
                     }
                     char la_buf[MON_STRING_BUF_SIZE];
                     sprintf(la_buf, "[%s], Failed to get my Master Node.\n", method_name);
                     mon_log_write(MON_MONITOR_MAIN_16, SQ_LOG_CRIT, la_buf);
 
-                    abort();
+                    mon_failure_exit();
                 }
                 else
                 {
                     if (trace_settings & TRACE_INIT)
                     {
-                          trace_printf("%s@%d (MasterMonitor) IsMaster=%d, masterNode=%s\n", method_name, __LINE__, IsMaster, masterNode->GetName() );
+                          trace_printf("%s@%d (MasterMonitor) IsMaster=%d, IAmIntegrating=%s, masterNode=%s\n"
+                                      , method_name, __LINE__, IsMaster, IAmIntegrating?"TRUE":"FALSE", masterNode->GetName() );
                     }
                 }
                 monitorLead = masterNode->GetPNid();
                 if (MyPNID == monitorLead)
                 {
-                     ZClient->WatchNodeMasterDelete (myNode->GetName() ); // just in case of stale info
-                     ZClient->CreateMasterZNode ( myNode->GetName() );
+                     ZClient->MasterZNodeDelete( myNode->GetName() ); // just in case of stale info
+                     ZClient->MasterZNodeCreate( myNode->GetName() );
                      strcpy (MasterMonitorName, myNode->GetName());
-                     ZClient->WatchMasterNode( MasterMonitorName );
+                     if (newMasterSelected)
+                     {
+                        newMasterSelected = false;
+                        IsMaster = true;
+                        IAmIntegrating = false;
+                        // This monitor is now the master, therefore load the
+                        // logical nodes (lnodes) as per the static configuration
+                        Monitor->InitializeConfigCluster( monitorLead );
+                     }
                      if (trace_settings & TRACE_INIT)
                      {
-                         trace_printf("%s@%d (MasterMonitor) IsMaster=%d, set monitor lead to %d\n", method_name, __LINE__, IsMaster, MyPNID);
+                         trace_printf("%s@%d (MasterMonitor) IsMaster=%d, IAmIntegrating=%s, master monitor pnid=%d\n"
+                                     , method_name, __LINE__, IsMaster, IAmIntegrating?"TRUE":"FALSE", MyPNID);
                      }
                  }
                  else
                  {
-                     masterMonitor = ZClient->WaitForAndReturnMaster(true);
+                     masterMonitor = ZClient->MasterWaitForAndReturn(true);
                      CNode *masterNode = NULL;
                      if (masterMonitor)
                      {
                          strcpy (MasterMonitorName, masterMonitor);
                          masterNode = Nodes->GetNode(MasterMonitorName);
+                        if (newMasterSelected)
+                        {
+                            newMasterSelected = false;
+                        }
                      }
 
                      if (masterNode)
                      {
                           if (trace_settings & TRACE_INIT)
                           {
-                              trace_printf("%s@%d (MasterMonitor) IsMaster=%d, set monitor lead to %d\n", method_name, __LINE__, IsMaster, masterNode->GetPNid());
+                            trace_printf( "%s@%d (MasterMonitor) IsMaster=%d, IAmIntegrating=%s, "
+                                          "master monitor pnid=%d\n"
+                                        , method_name, __LINE__
+                                        , IsMaster
+                                        , IAmIntegrating?"TRUE":"FALSE"
+                                        , masterNode->GetPNid());
                           }
                           monitorLead = masterNode->GetPNid();
-                          ZClient->WatchMasterNode( MasterMonitorName ); 
+                          if (monitorLead != configMasterPNid)
+                          {
+                            char *commPort = getenv ("MONITOR_COMM_PORT");
+                            if (commPort)
+                            {
+                                strcpy( IntegratingMonitorPort, MasterMonitorName);
+                                strcat( IntegratingMonitorPort, ":");
+                                strcat( IntegratingMonitorPort, commPort);
+                            }
+                          }
                      }
                      else
                      {
                           if (trace_settings & TRACE_INIT)
                           {
-                              trace_printf("%s@%d (MasterMonitor) IsMaster=%d, masterNode is NULL, with MasterMonitorName %s\n", method_name, __LINE__, IsMaster, MasterMonitorName);
+                            trace_printf( "%s@%d (MasterMonitor) IsMaster=%d, IAmIntegrating=%s, "
+                                          "masterNode is NULL, with MasterMonitorName %s\n"
+                                        , method_name, __LINE__
+                                        , IsMaster
+                                        , IAmIntegrating?"TRUE":"FALSE"
+                                        , MasterMonitorName);
                           }
                           char la_buf[MON_STRING_BUF_SIZE];
-                          sprintf(la_buf, "[%s], Failed to get my Master Node.\n", method_name);
-                          mon_log_write(MON_MONITOR_MAIN_17, SQ_LOG_CRIT, la_buf);
+                          sprintf( la_buf, "[%s], Master monitor (%s) is not available, selecting a new master\n"
+                                 , method_name, MasterMonitorName );
+                          mon_log_write(MON_MONITOR_MAIN_17, SQ_LOG_INFO, la_buf);
 
-                          abort();
-                     }
+                          ZClient->MasterZNodeDelete( MasterMonitorName ); // just in case of stale info
+                          CPNodeConfig * pnodeConfig = ClusterConfig->GetNextPNodeConfigByName( MasterMonitorName );
+                          assert( pnodeConfig );
+                          strcpy( MasterMonitorName, pnodeConfig->GetName() );
+
+                          newMasterSelected = true;
+
+                          goto retryMaster;
+                      }
                 }
             }
 #ifdef NAMESERVER_PROCESS
@@ -2184,10 +2524,13 @@
                 }
             }
 #endif
+            CNode *masterNode = Nodes->GetNode(monitorLead);
             char    buf[MON_STRING_BUF_SIZE];
             snprintf( buf, sizeof(buf)
-                           , "[%s], Master Monitor is on node %d\n"
-                           , method_name, monitorLead);
+                           , "[%s], Master Monitor is %s on node %d\n"
+                           , method_name
+                           , masterNode?masterNode->GetName():""
+                           , monitorLead);
             mon_log_write(MON_MONITOR_MAIN_18, SQ_LOG_INFO, buf);
         }
         if (!IAmIntegrating)
@@ -2204,7 +2547,7 @@
             }
             else
             {
-                MPI_Abort(MPI_COMM_SELF,99); // too early to call failsafe node down.
+                mon_failure_exit();
             }
         }
 #endif
@@ -2213,6 +2556,9 @@
         // Create health check thread
         HealthCheck.start();
 
+        // Create request worker threads
+        CReqWorker::startReqWorkers();
+
         // Create thread to accept connections from other monitors
         CommAccept.start();
 #ifdef NAMESERVER_PROCESS
@@ -2254,7 +2600,7 @@
                      method_name, BLOCK_SIZE, err, ErrorMsg(err));
             mon_log_write(MON_MONITOR_MAIN_4, SQ_LOG_CRIT, buf);
 
-            MPI_Abort(MPI_COMM_SELF,99);
+            mon_failure_exit();
         }
 
         memset( (void *)ioBuffer, 0 , BLOCK_SIZE );
@@ -2265,13 +2611,13 @@
         if (IsRealCluster)
         {
             snprintf(port_fname, sizeof(port_fname), "%s/monitor.port.%s",
-                     getenv("MPI_TMPDIR"), short_node_name );
+                     getenv("TRAF_LOG"), short_node_name );
         }
         else
         {
             // Write out our port number so other processes can attach.
             snprintf(port_fname, sizeof(port_fname), "%s/monitor.port.%d.%s",
-                     getenv("MPI_TMPDIR"),MyPNID,Node_name);
+                     getenv("TRAF_LOG"),MyPNID,Node_name);
         }
 #endif
 
@@ -2283,49 +2629,67 @@
         }
 
 #ifndef NAMESERVER_PROCESS
-        // create with no caching, user read/write, group read/write, other read
-        fd = open( port_fname
-                   , O_RDWR | O_TRUNC | O_CREAT | O_DIRECT
-                   , S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH );
-        if ( fd != -1 )
+        for ( i = 0; i < MAX_PORTFILEOPEN_RETRIES; i++ )
         {
-            snprintf( ioBuffer, BLOCK_SIZE, "%s", MyCommPort );
-            rc = write( fd, ioBuffer, BLOCK_SIZE );
-            if ( rc == -1 )
+            // create with no caching, user read/write, group read/write, other read
+            fd = open( port_fname
+                       , O_RDWR | O_TRUNC | O_CREAT | O_DIRECT | O_SYNC
+                       , S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH );
+            if ( fd != -1 )
+            {
+                snprintf( ioBuffer, BLOCK_SIZE, "%s", MyCommPort );
+                rc = write( fd, ioBuffer, BLOCK_SIZE );
+                if ( rc == -1 )
+                {
+                    int err = errno;
+                    char buf[MON_STRING_BUF_SIZE];
+                    snprintf(buf, sizeof(buf), "[%s], can't write %d bytes to "
+                             "port file (%s), error=%d(%s)\n",
+                             method_name, BLOCK_SIZE, port_fname, err,
+                             ErrorMsg(err));
+                    mon_log_write(MON_MONITOR_MAIN_5, SQ_LOG_CRIT, buf);
+    
+                    if ( IAmIntegrating )
+                    {
+                        // This monitor is reintegrating into cluster.  Inform
+                        // creator monitor of error, then abort.
+                        Monitor->ReIntegrate( CCluster::Reintegrate_Err10 );
+                    }
+                    else
+                    {
+                        mon_failure_exit();
+                    }
+                }
+                close( fd );
+                if (trace_settings & TRACE_INIT)
+                    trace_printf("%s@%d" " Port file created, pnid=%d, port=%s" "\n", method_name, __LINE__, MyPNID, MyCommPort );
+                break;    
+            }
+            else
             {
                 int err = errno;
                 char buf[MON_STRING_BUF_SIZE];
-                snprintf(buf, sizeof(buf), "[%s], can't write %d bytes to "
-                         "port file (%s), Error=%d(%s)\n",
-                         method_name, BLOCK_SIZE, port_fname, err,
-                         ErrorMsg(err));
-                mon_log_write(MON_MONITOR_MAIN_5, SQ_LOG_CRIT, buf);
-
-                if ( IAmIntegrating )
-                    // This monitor is reintegrating into cluster.  Inform
-                    // creator monitor of error, then abort.
-                    Monitor->ReIntegrate( CCluster::Reintegrate_Err10 );
-                else
-                    MPI_Abort(MPI_COMM_SELF,99);
+                snprintf( buf, sizeof(buf)
+                        , "[%s], can't open port file (%s), "
+                          "error=%d (%s), retry=%d, max retries=%d\n"
+                        , method_name, port_fname, err, ErrorMsg(err)
+                        , i, MAX_PORTFILEOPEN_RETRIES );
+                mon_log_write(MON_MONITOR_MAIN_6, SQ_LOG_ERR, buf);
+                sleep(portFileOpenDelay);
             }
-            close( fd );
-            if (trace_settings & TRACE_INIT)
-                trace_printf("%s@%d" " Port file created, pnid=%d, port=%s" "\n", method_name, __LINE__, MyPNID, MyCommPort );
         }
-        else
+        if ( fd == -1 )
         {
-            char buf[MON_STRING_BUF_SIZE];
-            snprintf(buf, sizeof(buf), "[%s], can't open port file (%s), "
-                     "Error= %s\n", method_name, port_fname,
-                     ErrorMsg(errno));
-            mon_log_write(MON_MONITOR_MAIN_6, SQ_LOG_CRIT, buf);
-
             if ( IAmIntegrating )
+            {
                 // This monitor is reintegrating into cluster.  Inform
                 // creator monitor of error, then abort.
                 Monitor->ReIntegrate( CCluster::Reintegrate_Err11 );
+            }
             else
-                MPI_Abort(MPI_COMM_SELF,99);
+            {
+                mon_failure_exit();
+            }
         }
         free( ioBuffer );
         int ret = SQ_theLocalIOToClient->initWorker();
@@ -2363,7 +2727,7 @@
             MyNode->SetPhase( Phase_Activating );
 
             if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
-                trace_printf("%s@%d" " After UpdateCluster" "\n", method_name, __LINE__);
+                trace_printf("%s@%d" " After ReIntegrate" "\n", method_name, __LINE__);
         }
         else
         {
@@ -2382,16 +2746,10 @@
     {
         if ( ZClientEnabled )
         {
+#ifndef NAMESERVER_PROCESS
+            Nodes->AddConfiguredZNodes();
+#endif
             StartZookeeperClient();
-            // Set watch for master
-            if (IsAgentMode)
-            {
-                ZClient->WatchMasterNode( MasterMonitorName );
-            }
-            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
-            {
-                trace_printf( "%s@%d (MasterMonitor) set watch for MasterMonitorName %s\n", method_name, __LINE__, MasterMonitorName );
-            }
         }
     }
 
@@ -2443,9 +2801,6 @@
     }
 #endif
 
-    // Create request worker threads
-    CReqWorker::startReqWorkers();
-
 #ifndef NAMESERVER_PROCESS
     if ( ! IAmIntegrating )
     {
@@ -2510,26 +2865,9 @@
         if (sonar_verify_state(SONAR_ENABLED | SONAR_MONITOR_ENABLED))
            MonStats->MonitorBusyIncr();
 
-#ifndef NAMESERVER_PROCESS
         Monitor->EnterSyncCycle();
-        if ( Monitor->TmSyncPending() )
-        {
-            Monitor->TmSync ();
-        }
+        done = Monitor->exchangeNodeData();
         Monitor->ExitSyncCycle();
-#endif
-
-#ifndef NAMESERVER_PROCESS
-        if ( !Monitor->GetPendingSlaveTmSync() &&
-             Monitor->GetTotalSlaveTmSyncCount() == 0 )
-        {
-#endif
-            Monitor->EnterSyncCycle();
-            done = Monitor->exchangeNodeData();
-            Monitor->ExitSyncCycle();
-#ifndef NAMESERVER_PROCESS
-        }
-#endif
 
         if (done)
             break;
@@ -2552,7 +2890,6 @@
 
     if ( ZClientEnabled )
     {
-        ZClient->StopMonitoring();
         ZClient->ShutdownWork();
     }
 
diff --git a/core/sqf/monitor/linux/monitor.h b/core/sqf/monitor/linux/monitor.h
index 3e6cc3a..a40dce4 100644
--- a/core/sqf/monitor/linux/monitor.h
+++ b/core/sqf/monitor/linux/monitor.h
@@ -26,33 +26,22 @@
 #ifndef MONITOR_H_
 #define MONITOR_H_
 
-#include "tmsync.h"
+#include "msgdef.h"
+#include "cluster.h"
 #include "process.h"
 
 
-#define MAX_PROCESSES       2048
-#define MAX_IO_OUTSTANDING  MAX_PROCESSES*4
+#define MAX_PROCESSES            2048
+#define MAX_PORTFILEOPEN_RETRIES    60   // number of attempts to open the monitor port file
+#define MAX_PORTFILEOPEN_DELAY       5   // seconds per retry (5 s * 60 retries = 300 s = 5 min)
+#define MIN_PORTFILEOPEN_DELAY       1   // seconds per retry (1 s * 60 retries = 60 s  = 1 min)
+#define DEFAULT_PORTFILEOPEN_DELAY   2   // seconds per retry (2 s * 60 retries = 120 s = 2 min)
 
 #define SUCCESS 0
 #define FAILURE 1
 
 
-enum OpType
-{
-    OpNull,
-    OpRecv,
-    OpSend
-#ifndef USE_BARRIER
-    ,
-    OpWake
-#endif
-};
-
-#ifdef NAMESERVER_PROCESS
 class CMonitor : public CCluster
-#else
-class CMonitor : public CTmSync_Container
-#endif
 {
 #ifndef NAMESERVER_PROCESS
 friend class SQ_LocalIOToClient;
diff --git a/core/sqf/monitor/linux/monlogging.cxx b/core/sqf/monitor/linux/monlogging.cxx
old mode 100755
new mode 100644
index e714263..42adff1
--- a/core/sqf/monitor/linux/monlogging.cxx
+++ b/core/sqf/monitor/linux/monlogging.cxx
@@ -36,7 +36,9 @@
 #include <sys/ipc.h>
 #include <sys/shm.h>
 #include <sys/msg.h>
+#include <sys/resource.h>
 #include <errno.h>
+
 #include "seabed/logalt.h"
 #include "monlogging.h"
 #include "montrace.h"
@@ -44,12 +46,32 @@
 
 #define gettid() syscall(__NR_gettid)
 
+bool GenCoreOnFailureExit = false;
+
 extern bool IsRealCluster;
 extern int MyPNID;
 extern CMonLog *MonLog;
 
 pthread_mutex_t       MonLogMutex = PTHREAD_MUTEX_INITIALIZER;
 
+void mon_failure_exit( bool genCoreOnFailureExit ) // terminates the process via abort(); never returns
+{
+    if (genCoreOnFailureExit || GenCoreOnFailureExit) // caller's request or the global GenCoreOnFailureExit flag
+    {
+        // Core file wanted: abort with the process's core limit left untouched
+        abort();
+    }
+    else
+    {
+        // No core file wanted: zero RLIMIT_CORE first, then abort intentionally
+        struct rlimit limit;
+        limit.rlim_cur = 0; // soft limit
+        limit.rlim_max = 0; // hard limit
+        setrlimit(RLIMIT_CORE, &limit); // return value is not checked
+        abort();
+    }
+}
+
 int mon_log_write(int eventType, posix_sqlog_severity_t severity, char *msg)
 {
     if (MonLog->isUseAltLog())
@@ -123,29 +145,39 @@
     gethostname(hostname, MAX_PROCESSOR_NAME);
     char   logFileSuffix[MAX_FILE_NAME];
 
-    if (myNid_ != -1)
+    // Set flag to indicate whether we are operating in a real cluster
+    // or a virtual cluster.
+    if ( getenv("SQ_VIRTUAL_NODES") )
     {
-        sprintf( logFileSuffix, ".%s.%d.log"
-               , hostname
-               , myNid_);
+        IsRealCluster = false;
     }
-    else if (myPNid_ != -1)
-    {
-        sprintf( logFileSuffix, ".%s.%d.log"
-               , hostname
-               , myPNid_);
-    }
-    else if ( myPNid_ == -1 && !IsRealCluster)
-    {
-        sprintf( logFileSuffix, ".%s.%d.log"
-               , hostname
-               , myPid_);
-    }
-    else
+
+    if (IsRealCluster)
     {
         sprintf( logFileSuffix, ".%s.log"
                , hostname );
     }
+    else
+    {
+        if (myNid_ != -1)
+        {
+            sprintf( logFileSuffix, ".%d.%s.log"
+                   , myNid_
+                   , hostname);
+        }
+        else if (myPNid_ != -1)
+        {
+            sprintf( logFileSuffix, ".%d.%s.log"
+                   , myPNid_
+                   , hostname);
+        }
+        else
+        {
+            sprintf( logFileSuffix, ".%d.%s.log"
+                   , myPid_
+                   , hostname);
+        }
+    }
 
     CommonLogger::instance().initLog4cxx(log4cxxConfig_.c_str(), logFileSuffix);
 }
@@ -330,7 +362,7 @@
                 method_name, err, strerror(err));
         mon_log_write(MON_LOG_ERROR_1, SQ_LOG_ERR, la_buf);
     
-        abort(); 
+        mon_failure_exit();
     }
 
     memLogHeader_ = (memLogHeader_t *)shmat( memLogID_, NULL, 0 );
@@ -342,7 +374,7 @@
         sprintf(la_buf, "[%s], Error= Can't map shared memory segment address! - errno=%d (%s)\n", method_name, err, strerror(err));
         mon_log_write(MON_LOG_ERROR_2, SQ_LOG_CRIT, la_buf);
         
-        abort();
+        mon_failure_exit();
     }
 
     memLogBase_ = (memLogEntry_t *)(memLogHeader_ + 1);
diff --git a/core/sqf/monitor/linux/monlogging.h b/core/sqf/monitor/linux/monlogging.h
old mode 100755
new mode 100644
index ab40448..923e363
--- a/core/sqf/monitor/linux/monlogging.h
+++ b/core/sqf/monitor/linux/monlogging.h
@@ -42,6 +42,12 @@
 #define MEM_LOG_KEY( monpid ) ( (0x1234 << 16) + (monpid & 0xFFFF) )
 #define MEM_LOG_SIZE( entries ) ( sizeof(memLogHeader_t) + ( sizeof(memLogEntry_t) * entries ) )
 
+#define GENERATE_CORE true // presumably the argument to mon_failure_exit() to request a core file -- confirm at call sites
+
+extern bool GenCoreOnFailureExit; // when true, mon_failure_exit() aborts without zeroing the core file limit
+
+extern void mon_failure_exit( bool genCoreOnFailureExit = false ); // aborts the process; default does not request a core file
+
 // Used by monitor process
 int mon_log_write(int event_type, posix_sqlog_severity_t severity, char *evl_buf);
 
diff --git a/core/sqf/monitor/linux/monmemlog.cxx b/core/sqf/monitor/linux/monmemlog.cxx
index 6084b80..e23ea67 100644
--- a/core/sqf/monitor/linux/monmemlog.cxx
+++ b/core/sqf/monitor/linux/monmemlog.cxx
@@ -281,11 +281,11 @@
             printf("Sync thread not responsive, mpi time exceeded %d secs, stuck in ", entry->value1_);
             switch (entry->value2_)
             {
-                case 1: printf("Comm Dup");
+                case 4: printf("Comm Dup");
                         break;
-                case 2: printf("AllGather");
+                case 2: printf("Allgather");
                         break;
-                case 4: printf("Barrier");
+                case 1: printf("Barrier");
                         break;
             }
             printf("\n");
diff --git a/core/sqf/monitor/linux/monsonar.cxx b/core/sqf/monitor/linux/monsonar.cxx
index 8eeed40..fcede55 100644
--- a/core/sqf/monitor/linux/monsonar.cxx
+++ b/core/sqf/monitor/linux/monsonar.cxx
@@ -46,8 +46,6 @@
      req_type_shutdown (0),
      req_type_startup (0),
      req_type_tmleader (0),
-     req_type_tmseqnum (0),
-     req_type_tmsync (0),
      req_type_zoneinfo (0),
      msg_type_close (0),
      msg_type_unsolicited (0),
@@ -122,10 +120,6 @@
                      method_name, __LINE__, req_type_startup);
         trace_printf("%s@%d- Monitor Stats: req_type_tmleader=%llu\n",
                      method_name, __LINE__, req_type_tmleader);
-        trace_printf("%s@%d- Monitor Stats: req_type_tmseqnum=%llu\n",
-                     method_name, __LINE__, req_type_tmseqnum);
-        trace_printf("%s@%d- Monitor Stats: req_type_tmsync=%llu\n",
-                     method_name, __LINE__, req_type_tmsync);
         trace_printf("%s@%d- Monitor Stats: req_type_zoneinfo=%llu\n",
                      method_name, __LINE__, req_type_zoneinfo);
         trace_printf("%s@%d- Monitor Stats: msg_type_close=%llu\n",
@@ -207,8 +201,6 @@
           {MONITOR_REQTYPE_SHUTDOWN_CTR, &req_type_shutdown},
           {MONITOR_REQTYPE_STARTUP_CTR, &req_type_startup},
           {MONITOR_REQTYPE_TMLEADER_CTR, &req_type_tmleader},
-          {MONITOR_REQTYPE_TMSEQNUM_CTR, &req_type_tmseqnum},
-          {MONITOR_REQTYPE_TMSYNC_CTR, &req_type_tmsync},
           {MONITOR_REQTYPE_ZONEINFO_CTR, &req_type_zoneinfo},
           {MONITOR_SYNC_CYCLES_CTR, &req_sync},
 
diff --git a/core/sqf/monitor/linux/monsonar.h b/core/sqf/monitor/linux/monsonar.h
index 4a8e25e..0b37739 100644
--- a/core/sqf/monitor/linux/monsonar.h
+++ b/core/sqf/monitor/linux/monsonar.h
@@ -190,16 +190,6 @@
         req_type_tmleader++;
     }
 
-    virtual inline void req_type_tmseqnum_Incr()
-    {
-        req_type_tmseqnum++;
-    }
-
-    virtual inline void req_type_tmsync_Incr()
-    {
-        req_type_tmsync++;
-    }
-
     virtual inline void req_type_zoneinfo_Incr()
     {
         req_type_zoneinfo++;
@@ -383,8 +373,6 @@
     unsigned long long req_type_shutdown;
     unsigned long long req_type_startup;
     unsigned long long req_type_tmleader;
-    unsigned long long req_type_tmseqnum;
-    unsigned long long req_type_tmsync;
     unsigned long long req_type_zoneinfo;
 
     unsigned long long msg_type_close;
@@ -565,16 +553,6 @@
         req_type_tmleader.increment();
     }
 
-    virtual inline void req_type_tmseqnum_Incr()
-    {
-        req_type_tmseqnum.increment();
-    }
-
-    virtual inline void req_type_tmsync_Incr()
-    {
-        req_type_tmsync.increment();
-    }
-
     virtual inline void req_type_zoneinfo_Incr()
     {
         req_type_zoneinfo.increment();
@@ -703,8 +681,6 @@
     IncrementingCounter req_type_shutdown;
     IncrementingCounter req_type_startup;
     IncrementingCounter req_type_tmleader;
-    IncrementingCounter req_type_tmseqnum;
-    IncrementingCounter req_type_tmsync;
     IncrementingCounter req_type_zoneinfo;
     IncrementingCounter req_sync;
 
diff --git a/core/sqf/monitor/linux/montrace.cxx b/core/sqf/monitor/linux/montrace.cxx
index 96eab05..99c784b 100644
--- a/core/sqf/monitor/linux/montrace.cxx
+++ b/core/sqf/monitor/linux/montrace.cxx
@@ -117,20 +117,20 @@
     {
 #ifndef NAMESERVER_PROCESS
         sprintf(trace_file_name,"%s/monitor.trace.%d.%s",
-                getenv("MPI_TMPDIR"),MyPNID,Node_name);
+                getenv("TRAF_LOG"),MyPNID,Node_name);
 #else
         sprintf(trace_file_name,"%s/trafns.trace.%d.%s",
-                getenv("MPI_TMPDIR"),MyPNID,Node_name);
+                getenv("TRAF_LOG"),MyPNID,Node_name);
 #endif
     }
     else
     {
 #ifndef NAMESERVER_PROCESS
         sprintf(trace_file_name,"%s/monitor.trace.%s",
-                getenv("MPI_TMPDIR"), Node_name);
+                getenv("TRAF_LOG"), Node_name);
 #else
         sprintf(trace_file_name,"%s/trafns.trace.%s",
-                getenv("MPI_TMPDIR"), Node_name);
+                getenv("TRAF_LOG"), Node_name);
 #endif
     }
     remove(trace_file_name);
@@ -155,7 +155,7 @@
         }
         else // Make user specified file name unique per node
         {
-            sprintf(trace_file_name,"%s/%s.%d.%s",getenv("MPI_TMPDIR"),
+            sprintf(trace_file_name,"%s/%s.%d.%s",getenv("TRAF_LOG"),
                     pfname,MyPNID,Node_name);
         }
     }
@@ -294,7 +294,7 @@
             }
             else 
             {   // Make user specified file name unique per node
-                sprintf(fname,"%s/%s.%d.%s",getenv("MPI_TMPDIR"),
+                sprintf(fname,"%s/%s.%d.%s",getenv("TRAF_LOG"),
                         traceFileBase_,MyPNID,Node_name);
             }
         }
@@ -303,20 +303,20 @@
             if( getenv("SQ_VIRTUAL_NODES") )
             {
 #ifndef NAMESERVER_PROCESS
-                sprintf(fname,"%s/monitor.trace.%d.%s",getenv("MPI_TMPDIR"), MyPNID,
+                sprintf(fname,"%s/monitor.trace.%d.%s",getenv("TRAF_LOG"), MyPNID,
                         Node_name);
 #else
-                sprintf(fname,"%s/trafns.trace.%d.%s",getenv("MPI_TMPDIR"), MyPNID,
+                sprintf(fname,"%s/trafns.trace.%d.%s",getenv("TRAF_LOG"), MyPNID,
                         Node_name);
 #endif
             }
             else
             {
 #ifndef NAMESERVER_PROCESS
-                sprintf(fname,"%s/monitor.trace.%s",getenv("MPI_TMPDIR"),
+                sprintf(fname,"%s/monitor.trace.%s",getenv("TRAF_LOG"),
                         Node_name);
 #else
-                sprintf(fname,"%s/trafns.trace.%s",getenv("MPI_TMPDIR"),
+                sprintf(fname,"%s/trafns.trace.%s",getenv("TRAF_LOG"),
                         Node_name);
 #endif
             }
diff --git a/core/sqf/monitor/linux/monwdt.cxx b/core/sqf/monitor/linux/monwdt.cxx
index 5426c6f..f9a6e22 100644
--- a/core/sqf/monitor/linux/monwdt.cxx
+++ b/core/sqf/monitor/linux/monwdt.cxx
@@ -112,7 +112,7 @@
             if ( ! ClusterConfig.LoadConfig() )
             {
                 printf("[%s], Failed to load cluster configuration.\n", MyName);
-                abort();
+                exit(EXIT_FAILURE);
             }
         }
         else
@@ -122,7 +122,7 @@
             {
                 MyNid = 0;
             }
-            abort();
+            exit(EXIT_FAILURE);
         }
 
         lnodeConfig = ClusterConfig.GetLNodeConfig( MyNid );
@@ -139,7 +139,7 @@
         char tracefile[MAX_SEARCH_PATH];
         char *tmpDir;
     
-        tmpDir = getenv( "MPI_TMPDIR" );
+        tmpDir = getenv( "TRAF_LOG" );
         if (tmpDir)
         {
             sprintf( tracefile, "%s/monwdt.trace.%d", tmpDir, getpid() );
diff --git a/core/sqf/monitor/linux/msgdef.h b/core/sqf/monitor/linux/msgdef.h
index 8218a8e..e40f124 100644
--- a/core/sqf/monitor/linux/msgdef.h
+++ b/core/sqf/monitor/linux/msgdef.h
@@ -75,7 +75,7 @@
 #define MAX_NODE_MASKS   (MAX_NODES/MAX_NODE_BITMASK) // Node bit mask array size
 
 #define MAX_FAULT_ZONES  16 
-#define MAX_FILE_NAME    256
+#define MAX_FILE_NAME    1024
 #define MAX_KEY_NAME     64
 #define MAX_KEY_LIST     32
 #define MAX_NODE_LIST    64
@@ -106,6 +106,11 @@
 #define MAX_VALUE_SIZE   512
 #define MAX_VALUE_SIZE_INT 4096
 
+// The following define specifies the default value for the HA watchdog
+// timer, used if the timer related environment variables are not defined.
+// Defaults to a 60 second Watchdog process timer expiration.
+#define WDT_KEEPALIVETIMERDEFAULT 60
+
 // Use STRCPY when the size of the source string is variable and unknown.
 // Safe strcpy - checks that destination has enough capacity to hold
 // source string.  If not, source string is truncated.
@@ -155,6 +160,13 @@
 } PStartDEvent_t;
 
 typedef enum {
+    AgentType_Undefined=0,
+    AgentType_Ambari,
+    AgentType_CM,
+    AgentType_MPI
+} AgentType_t;
+
+typedef enum {
     ConfigType_Undefined=0,                 // Invalid
     ConfigType_Cluster,                     // Gobal to cluster configuration data
     ConfigType_Node,                        // Local to node configuration data
@@ -228,6 +240,7 @@
     ReqType_Event,                          // send target processes an Event notice
     ReqType_Exit,                           // process is exiting
     ReqType_Get,                            // retrieve information from the registry
+    ReqType_InstanceId,                     // get Cluster Id and Instance Id
     ReqType_Kill,                           // stop and cleanup the identified process
     ReqType_MonStats,                       // get monitor statistics
     ReqType_Mount,                          // mount device associated with process    
@@ -257,11 +270,8 @@
     ReqType_Shutdown,                       // request cluster shutdown
     ReqType_ShutdownNs,                     // request nameserver shutdown
     ReqType_Startup,                        // process startup notification
-    ReqType_Stfsd,                          // process stfsd request
     ReqType_TmLeader,                       // request to become the TM leader
     ReqType_TmReady,                        // request to indicate TM ready for transactions
-    ReqType_TmSync,                         // request to sync data across all TM's in cluster
-    ReqType_TransInfo,                      // request transaction enlistment information
     ReqType_ZoneInfo,                       // zone information request 
 
     ReqType_Invalid                         // marks the end of the request
@@ -278,6 +288,7 @@
     ReplyType_DelProcessNs,                 // reply with results
     ReplyType_Dump,                         // reply with dump info
     ReplyType_Get,                          // reply with configuration key/value pairs
+    ReplyType_InstanceId,                   // reply with Cluster Id and Instance Id
     ReplyType_MonStats,                     // reply with monitor statistics
     ReplyType_Mount,                        // reply with mount info
     ReplyType_NewProcess,                   // reply with new process information
@@ -289,13 +300,9 @@
     ReplyType_PNodeInfo,                    // reply with info on list of physical nodes
     ReplyType_ProcessInfo,                  // reply with info on list of processes
     ReplyType_ProcessInfoNs,                // reply with info of process
-    ReplyType_Stfsd,                        // reply with stfsd info
     ReplyType_Startup,                      // reply with startup info
-    ReplyType_TmSync,                       // reply from unsolicited TmSync message
-    ReplyType_TransInfo,                    // reply with transaction enlistment process list
     ReplyType_ZoneInfo,                     // reply with info on list of zones
 
-
     ReplyType_Invalid                       // marks the end of the reply types,
                                             // add any new reply types before
                                             // this one
@@ -314,7 +321,6 @@
     MsgType_NodeDeleted,                    // node deleted from configuration notification
     MsgType_NodeDown,                       // node is down notification
     MsgType_NodeJoining,                    // node is joining notification
-    MsgType_NodePrepare,                    // node prepare notification
     MsgType_NodeQuiesce,                    // node quiesce notification (always followed by node down)
     MsgType_NodeUp,                         // node is up notification
     MsgType_Open,                           // process open notification
@@ -324,10 +330,6 @@
     MsgType_Service,                        // request a service from the monitor
     MsgType_Shutdown,                       // system shutdown notification
     MsgType_SpareUp,                        // spare node is up notification
-    MsgType_TmRestarted,                    // DTM process restarted notification
-    MsgType_TmSyncAbort,                    // request to abort TM sync data previously received
-    MsgType_TmSyncCommit,                   // request to commit previously received TM sync data
-    MsgType_UnsolicitedMessage,             // Outgoing monitor msg expecting a reply 
 
     MsgType_Invalid                         // marks the end of the message
                                             // types, add any new message types 
@@ -473,6 +475,19 @@
     } list[MAX_KEY_LIST];
 };
 
+struct InstanceId_def                       // InstanceId request data: identifies the requesting process
+{
+    int nid;                                // requesting process's node id
+    int pid;                                // requesting process id
+    Verifier_t verifier;                    // requesting process's verifier
+};
+
+struct InstanceId_reply_def                 // InstanceId reply data: cluster id and instance id
+{
+    int cluster_id;                         // this instance's cluster id
+    int instance_id;                        // this instance's instance id
+};
+
 struct Kill_def
 {
     int  nid;                               // requesting process's node id
@@ -1080,24 +1095,6 @@
     char fifo_stderr [MAX_PROCESS_PATH];
 };
 
-struct Stfsd_def
-{
-    int  nid;                               // Requester's node id
-    int  pid;                               // Requester's process id
-    int  tag;                               // Requester's tag
-    int  length;                            // The length in bytes used the data buffer
-    char data[MAX_STFSD_DATA];              // The data to be sent 
-};
-
-struct Stfsd_reply_def
-{
-    int  nid;                                // Replying STFSD's node id
-    int  pid;                                // Replying STFSD's process id
-    int  return_code;                        // If non-zero, error code
-    int  length;                             // The length in bytes of the data buffer
-    char data[MAX_STFSD_DATA];               // Reply data
-};
-
 struct TmLeader_def
 {
     int nid;                                // Requesting TM's node id
@@ -1110,70 +1107,6 @@
     int pid;                                // Requesting TM's process id
 };
 
-struct TmRestarted_def
-{
-    int  nid;                               // Restarted TM's logical node id
-    int  pnid;                              // Restarted TM's physical node id
-    char node_name[MPI_MAX_PROCESSOR_NAME]; // Restarted TM's physical node name
-};
-
-struct TmSync_def
-{
-    int  nid;                               // Requesting TM's node id
-    int  pid;                               // Requesting TM's process id
-    int  tag;                               // Requesting TM's tag
-    int  length;                            // The length in bytes used the data buffer
-    char data[MAX_SYNC_DATA];               // The data to be sent to all TM's in the cluster
-};
-
-struct TmSync_reply_def
-{
-    int nid;                                // Replying TM's node id
-    int pid;                                // Replying TM's process id
-    int handle;                             // Request associated handle for sync completed notice
-    int return_code;                        // If non-zero, TM not excepting sync data
-};
-
-struct TmSyncNotice_def
-{
-    int nid[MAX_TM_SYNCS];                  // Owning TM's node id
-    int orig_count;                         // Number of originator tags
-    int orig_tag[MAX_TM_SYNCS];             // Originator tag (only valid for orig)
-    int orig_handle[MAX_TM_SYNCS];          // Originator handle (only valid for orig)
-    int count;                              // Number of handles returned
-    int handle[MAX_TM_SYNCS];               // Requests associated handle for sync completed notice
-};
-
-struct UnsolicitedTmSync_def
-{
-    int  nid;                               // Requesting TM's node id or target TM's node id
-    int  pid;                               // Requesting TM's process id or target  TM's process id
-    int  handle;                            // Request associated handle for sync completed notice
-    int  length;                            // The length in bytes used the data buffer
-    char data[MAX_SYNC_DATA];               // The data to be sent to all TM's in the cluster
-};
-
-struct TransInfo_def
-{ // Deprecated
-    int  nid;                               // Requesting process's node id
-    int  pid;                               // Requesting process id
-    char process_name[MAX_PROCESS_NAME];    // Name of process to list associated transactions
-                                            // If NULL then use trans-id to list assocaited process
-    _TM_Txid_External trans_id;             // Transaction ID of enlisted processes to list
-};
-
-struct TransInfo_reply_def
-{ // Deprecated
-    int  num_processes;                     // Number of process returned
-    struct 
-    {
-        int  nid;                           // Associated process's node id
-        int  pid;                           // Associated process's process id
-        _TM_Txid_External trans_id;         // Transiaction ID associated with process
-    } procs[MAX_PROC_LIST];
-    int  return_code;                       // Error returned to sender
-};
-
 struct ZoneInfo_def
 {
     int nid;                                // node id of requesting process
@@ -1218,6 +1151,7 @@
         struct Event_Notice_def      event_notice;
         struct Exit_def              exit;
         struct Get_def               get;
+        struct InstanceId_def        instance_id;
         struct Mount_def             mount;
         struct Kill_def              kill;
         struct NameServerAdd_def     nameserver_add;
@@ -1243,16 +1177,8 @@
         struct Shutdown_def          shutdown;
         struct ShutdownNs_def        shutdown_ns;
         struct Startup_def           startup;
-#ifdef SQ_STFSD
-        struct Stfsd_def             stfsd;
-#endif
         struct TmLeader_def          leader;
         struct TmReady_def           tm_ready;
-        struct TmRestarted_def       tm_restart;
-        struct TmSync_def            tm_sync;
-        struct TmSyncNotice_def      tm_sync_notice;
-        struct TransInfo_def         trans_info;
-        struct UnsolicitedTmSync_def unsolicited_tm_sync;
         struct NodeUp_def            up;
         struct NodeQuiesce_def       quiesce;
         struct NodePrepare_def       prepare;
@@ -1273,6 +1199,7 @@
         struct Dump_reply_def          dump;
         struct Generic_reply_def       generic;
         struct Get_reply_def           get;
+        struct InstanceId_reply_def    instance_id;
         struct Mount_reply_def         mount;
         struct NewProcess_reply_def    new_process;
         struct NewProcessNs_reply_def  new_process_ns;
@@ -1283,13 +1210,7 @@
         struct ProcessInfo_reply_def   process_info;
         struct ProcessInfoNs_reply_def process_info_ns;
         struct Startup_reply_def       startup_info;
-#ifdef SQ_STFSD
-        struct Stfsd_reply_def         stfsd;
-#endif
         int                            tm_seqnum;
-        struct TmSync_reply_def        tm_sync;
-        struct TransInfo_reply_def     trans_info;
-        struct TmSync_reply_def        unsolicited_tm_sync;
         struct Close_reply_def         close;
         struct MonStats_reply_def      mon_info;
         struct ZoneInfo_reply_def      zone_info;
@@ -1304,7 +1225,6 @@
     int reply_tag;
     union
     {
-        int                 handle;    // used only for TmSync notices
         struct request_def  request;
         struct reply_def    reply;
     } u;
diff --git a/core/sqf/monitor/linux/pnode.cxx b/core/sqf/monitor/linux/pnode.cxx
index 4c21729..54c6254 100644
--- a/core/sqf/monitor/linux/pnode.cxx
+++ b/core/sqf/monitor/linux/pnode.cxx
@@ -49,11 +49,11 @@
 #include "pnode.h"
 #include "mlio.h"
 #include "nameserver.h"
-
 #include "replicate.h"
 #include "reqqueue.h"
 #include "healthcheck.h"
 #ifndef NAMESERVER_PROCESS
+#include "zclient.h"
 #include "ptpclient.h"
 #endif
 
@@ -85,11 +85,15 @@
 
 const char *StateString( STATE state);
 #ifndef NAMESERVER_PROCESS
+extern const char *ProcessTypeString( PROCESSTYPE type );
 const char *SyncStateString( SyncState state);
 extern CPtpClient *PtpClient;
 extern CNameServer *NameServer;
 extern CProcess *NameServerProcess;
 extern bool NameServerEnabled;
+extern bool ZClientEnabled;
+extern bool IsMaster;
+extern CZClient *ZClient;
 #endif
 extern CNameServerConfigContainer *NameServerConfig;
 
@@ -114,11 +118,6 @@
 #define WDIOC_SQ_GETTIMEOUT        _IOR(WATCHDOG_IOCTL_BASE, 7, int)
 
 
-// The following defines specify the default values for the HA
-// timers if the timer related environment variables are not defined.
-// Defaults to 60 second Watchdog process timer expiration
-#define WDT_KeepAliveTimerDefault 60
-
 // Default interval used by GetSchedulingData (in milliseconds)
 unsigned long int CNode::minSchedDataInterval_ = 500;
 
@@ -141,7 +140,11 @@
 size_t CNode::memInfoStringLen_[memFinalItem];
 
 
-CNode::CNode( char *name, int pnid, int rank )
+CNode::CNode( char *name
+            , char *domain
+            , char *fqdn
+            , int pnid
+            , int rank )
       :CLNodeContainer(this)
       ,CProcessContainer(true)
       ,pnid_(pnid)
@@ -153,6 +156,10 @@
       ,killingNode_(false)
       ,dtmAborted_(false)
       ,smsAborted_(false)
+      ,pendingNodeDown_(false)
+      ,primitiveDtmUp_(false)
+      ,primitivePsdUp_(false)
+      ,primitiveWdgUp_(false)
       ,lastLNode_(NULL)
       ,lastSdLevel_(ShutdownLevel_Undefined)
       ,rankFailure_(false)
@@ -165,13 +172,9 @@
       ,next_(NULL)
       ,prev_(NULL)
       ,rank_(rank)
-#ifndef NAMESERVER_PROCESS
-      ,tmSyncNid_(-1)
-      ,tmSyncState_(SyncState_Null)
-#endif
       ,shutdownLevel_(ShutdownLevel_Undefined)
       ,shutdownNameServer_(false)
-      ,wdtKeepAliveTimerValue_(WDT_KeepAliveTimerDefault)
+      ,wdtKeepAliveTimerValue_(WDT_KEEPALIVETIMERDEFAULT)
       ,zid_(pnid)
       ,commPort_("")
       ,syncPort_("")
@@ -210,6 +213,8 @@
     prevSchedData_.tv_nsec = 0;
 
     STRCPY(name_, name);
+    STRCPY(domain_, domain);
+    STRCPY(fqdn_, fqdn);
     
     hostname_ = name;
     size_t pos = hostname_.find_first_of( ".:" );
@@ -272,12 +277,12 @@
 #endif
     internalState_ = State_Default; 
 
-    uniqStrId_ = Config->getMaxUniqueId ( pnid_ ) + 1;
-
     TRACE_EXIT;
 }
 
 CNode::CNode( char *name
+            , char *domain
+            , char *fqdn
             , int   pnid
             , int   rank
             , int   sparePNidCount
@@ -294,6 +299,7 @@
       ,killingNode_(false)
       ,dtmAborted_(false)
       ,smsAborted_(false)
+      ,pendingNodeDown_(false)
       ,lastLNode_(NULL)
       ,lastSdLevel_(ShutdownLevel_Undefined)
       ,excludedCoreMask_(excludedCoreMask)
@@ -307,11 +313,9 @@
       ,next_(NULL)
       ,prev_(NULL)
       ,rank_(rank)
-      ,tmSyncNid_(-1)
-      ,tmSyncState_(SyncState_Suspended)
       ,shutdownLevel_(ShutdownLevel_Undefined)
       ,shutdownNameServer_(false)
-      ,wdtKeepAliveTimerValue_(WDT_KeepAliveTimerDefault)
+      ,wdtKeepAliveTimerValue_(WDT_KEEPALIVETIMERDEFAULT)
       ,zid_(-1)
       ,commPort_("")
       ,syncPort_("")
@@ -336,6 +340,8 @@
     memcpy(&eyecatcher_, "PNOD", 4);
 
     STRCPY(name_, name);
+    STRCPY(domain_, domain);
+    STRCPY(fqdn_, fqdn);
     
     hostname_ = name;
     size_t pos = hostname_.find_first_of( ".:" );
@@ -403,8 +409,6 @@
 #endif
     internalState_ = State_Default; 
 
-    uniqStrId_ = Config->getMaxUniqueId ( pnid_ ) + 1;
-
     TRACE_EXIT;
 }
 
@@ -549,11 +553,9 @@
     if ( tmReady )
     {
         if (trace_settings & (TRACE_INIT | TRACE_SYNC | TRACE_TMSYNC))
-            trace_printf("%s@%d - Activation Phase_Ready on node %s, pnid=%d\n", method_name, __LINE__, GetName(), GetPNid());
+            trace_printf("%s@%d - Setting Phase_Ready on node %s, pnid=%d\n", method_name, __LINE__, GetName(), GetPNid());
         phase_ = Phase_Ready;
-#ifndef NAMESERVER_PROCESS
-        tmSyncState_ = SyncState_Null;
-#endif
+        HealthCheck.triggerTimeToLogHealth();
     }
 
     TRACE_EXIT;
@@ -1040,23 +1042,28 @@
 strId_t CNode::GetStringId( char *candidate, CLNode *targetLNode, bool clone )
 {
     const char method_name[] = "CNode::GetStringId";
-    strId_t id;
-
     TRACE_ENTRY;
 
-    if ( ! Config->findUniqueString ( pnid_, candidate, id ) )
-    {   // The string is not in the configuration database, add it
-        id.id  = uniqStrId_++;
-        id.nid = pnid_;
+    strId_t existStrId;
+    strId_t strId;
+    string  existUString;
 
-        if (trace_settings & TRACE_PROCESS)
+    if ( ! Config->getUniqueStringId( pnid_, candidate, strId ) )
+    {   // The candidate string is not in the configuration database
+        if (uniqStrId_ == -1)
+        {   // Get the last unique string id assigned
+            uniqStrId_ = Config->getMaxUniqueId( pnid_ );
+        }
+        existStrId.nid = pnid_;
+        existStrId.id  = ++uniqStrId_;
+        while (Config->getUniqueString(existStrId.nid, existStrId.id, existUString))
         {
-            trace_printf("%s@%d - Adding unique string id=[%d,%d] (%s), targetLnode=%p, targetNid=%d\n",
-                         method_name, __LINE__, id.nid, id.id, candidate, 
-                         targetLNode, targetLNode?targetLNode->GetNid():-1 );
+            existStrId.id  = ++uniqStrId_;
         }
 
-        Config->addUniqueString(id.nid, id.id, candidate);
+        strId.nid = pnid_;
+        strId.id  = uniqStrId_;
+        Config->addUniqueString(strId.nid, strId.id, candidate);
 
 #ifndef NAMESERVER_PROCESS
         if (NameServerEnabled)
@@ -1065,8 +1072,8 @@
                 !MyNode->IsMyNode(targetLNode->GetNid()))
             {
                 // Forward the unique string to the target node
-                int rc = PtpClient->ProcessAddUniqStr( id.nid
-                                                     , id.id
+                int rc = PtpClient->ProcessAddUniqStr( strId.nid
+                                                     , strId.id
                                                      , candidate
                                                      , targetLNode->GetNid()
                                                      , targetLNode->GetNode()->GetName() );
@@ -1090,7 +1097,7 @@
             clone = clone;  // Make compiler happy!
             targetLNode = targetLNode;  // Make compiler happy!
 #endif
-            CReplUniqStr *repl = new CReplUniqStr ( id.nid, id.id, candidate );
+            CReplUniqStr *repl = new CReplUniqStr ( strId.nid, strId.id, candidate );
             Replicator.addItem(repl);
         }
     }
@@ -1100,7 +1107,7 @@
         if (trace_settings & TRACE_PROCESS)
         {
             trace_printf("%s@%d - unique string id=[%d,%d] (%s)\n",
-                         method_name, __LINE__, id.nid, id.id, candidate );
+                         method_name, __LINE__, strId.nid, strId.id, candidate );
         }
 
 #ifndef NAMESERVER_PROCESS
@@ -1110,8 +1117,8 @@
                 !MyNode->IsMyNode(targetLNode->GetNid()))
             {
                 // Forward the unique string to the target node
-                int rc = PtpClient->ProcessAddUniqStr( id.nid
-                                                     , id.id
+                int rc = PtpClient->ProcessAddUniqStr( strId.nid
+                                                     , strId.id
                                                      , candidate
                                                      , targetLNode->GetNid()
                                                      , targetLNode->GetNode()->GetName());
@@ -1133,16 +1140,81 @@
 
     TRACE_EXIT;
 
-    return id;
+    return strId;
 }
 
-CNode *CNode::Link( CNode * entry )
+CNode *CNode::LinkAfter( CNode * &tail, CNode * entry ) // inserts entry right after this node; returns entry
 {
-    const char method_name[] = "CNode::Link";
+    const char method_name[] = "CNode::LinkAfter";
     TRACE_ENTRY;
-    next_ = entry;
-    entry->prev_ = this;
 
+    entry->prev_ = this;
+    if (next_ == NULL)
+    {   // this node has no successor: entry ends the list and becomes the caller's tail
+        entry->next_ = NULL;
+        tail = entry;
+    }
+    else
+    {   // splice entry between this node and its current successor
+        entry->next_ = next_;
+        next_->prev_ = entry;
+    }
+    next_ = entry;
+
+    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+    {   // trace the pnids around both nodes; -1 stands for a NULL neighbor
+        trace_printf( "%s@%d - Linked physical node object "
+                      "tail=%d\n"
+                      "\t\tthis: prev=%d, this=%d, next=%d\n"
+                      "\t\tentry: prev=%d, entry=%d, next=%d\n"
+                    , method_name, __LINE__
+                    , tail->GetPNid()
+                    , prev_?prev_->GetPNid():-1
+                    , GetPNid()
+                    , next_?next_->GetPNid():-1
+                    , entry->prev_?entry->prev_->GetPNid():-1
+                    , entry->GetPNid()
+                    , entry->next_?entry->next_->GetPNid():-1 );
+    }
+    
+    TRACE_EXIT;
+    return entry;
+}
+
+CNode *CNode::LinkBefore( CNode * &head, CNode * entry ) // inserts entry right before this node; returns entry
+{
+    const char method_name[] = "CNode::LinkBefore";
+    TRACE_ENTRY;
+
+    entry->next_ = this;
+    if (prev_ == NULL)
+    {   // this node has no predecessor: entry starts the list and becomes the caller's head
+        entry->prev_ = NULL;
+        head = entry;
+    }
+    else
+    {   // splice entry between this node's current predecessor and this node
+        entry->prev_ = prev_;
+        prev_->next_ = entry;
+    }
+    prev_ = entry;
+
+    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+    {   // trace the pnids around both nodes; -1 stands for a NULL neighbor
+        trace_printf( "%s@%d - Linked physical node object "
+                      "head=%d\n"
+                      "\t\tthis: prev=%d, this=%d, next=%d\n"
+                      "\t\tentry: prev=%d, entry=%d, next=%d\n"
+                    , method_name, __LINE__
+                    , head->GetPNid()
+                    , prev_?prev_->GetPNid():-1
+                    , GetPNid()
+                    , next_?next_->GetPNid():-1
+                    , entry->prev_?entry->prev_->GetPNid():-1
+                    , entry->GetPNid()
+                    , entry->next_?entry->next_->GetPNid():-1 );
+    }
+    
     TRACE_EXIT;
     return entry;
 }
@@ -1235,6 +1307,179 @@
     TRACE_EXIT;
 }
 
+void CNode::GetPersistProcessAttributes( CPersistConfig *persistConfig  // in:  configuration to read
+                                       , int             nid            // in:  node id; -1 omits the nid suffix
+                                       , PROCESSTYPE    &processType    // out: configured process type
+                                       , char           *processName    // out: name prefix [+ nid]
+                                       , char           *programName    // out: program name
+                                       , int            &programArgc    // out: argument count
+                                       , char           *programArgs    // out: written only if programArgc != 0
+                                       , char           *outfile        // out: stdout prefix [+ nid]
+                                       , char           *persistRetries // out: "<retries>,<window>"
+                                       , char           *persistZones ) // out: "-1 (ALL)" or the nid
+{
+    const char method_name[] = "CNode::GetPersistProcessAttributes";
+    char zoneStr[MAX_PERSIST_VALUE_STR];
+    // Copies persistConfig's attributes into the caller's outputs; the char* buffers are not bounds-checked here.
+    processType = persistConfig->GetProcessType();
+
+    switch (persistConfig->GetZoneZidFormat())
+    {
+    case Zid_ALL:
+        sprintf( zoneStr, "%d (ALL)", -1 );
+        strcpy( persistZones, zoneStr );  // overwrite like the default case; appending would read an unset buffer
+        break;
+    case Zid_RELATIVE:
+    default:
+        sprintf( zoneStr, "%d", nid );
+        strcpy( persistZones, zoneStr );
+        break;
+    }
+    // With nid == -1 the bare prefixes are used; otherwise the nid is appended to both names.
+    if ( nid == -1 )
+    {
+        sprintf( processName, "%s"
+               , persistConfig->GetProcessNamePrefix() );
+        sprintf( outfile, "%s"
+               , persistConfig->GetStdoutPrefix() );
+    }
+    else
+    {
+        sprintf( processName, "%s%d"
+               , persistConfig->GetProcessNamePrefix()
+               , nid );
+        sprintf( outfile, "%s%d"
+               , persistConfig->GetStdoutPrefix()
+               , nid );
+    }
+
+    sprintf( programName, "%s", persistConfig->GetProgramName() );
+    // programArgs is left untouched when the configuration has no arguments.
+    programArgc = persistConfig->GetProgramArgc();
+    if (programArgc)
+    {
+        sprintf( programArgs, "%s"
+               , persistConfig->GetProgramArgs() );
+    }
+
+    sprintf( persistRetries, "%d,%d"
+           , persistConfig->GetPersistRetries()
+           , persistConfig->GetPersistWindow() );
+
+    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_PROCESS | TRACE_PROCESS_DETAIL))
+        trace_printf( "%s@%d Persist process Nid=%d, "
+                      "processName=%s, type=%s, stdout=%s, "
+                      "persistRetries=%s, persistZones=%s\n"
+                    , method_name, __LINE__
+                    , nid, processName
+                    , ProcessTypeString(persistConfig->GetProcessType())
+                    , outfile
+                    , persistRetries
+                    , persistZones );
+}
+
+void CNode::StartDtmProcess( void )
+{
+    const char method_name[] = "CNode::StartDtmProcess";
+    TRACE_ENTRY;
+    // Creates the DTM process on a nid assigned by MyNode; the process name,
+    // program name and stdout file come from the "DTM" persist configuration.
+    char *ldpath = NULL;
+    char path[MAX_SEARCH_PATH];
+    char processName[MAX_PROCESS_NAME];
+    char programArgs[MAX_VALUE_SIZE_INT];
+    char programName[MAX_PROCESS_NAME];
+    char outfile[MAX_PROCESS_PATH];
+    char persistRetries[MAX_PERSIST_VALUE_STR];
+    char persistZones[MAX_VALUE_SIZE_INT];
+    char stdoutFile[MAX_PROCESS_PATH]; // not 'stdout': <stdio.h> may define that name as a macro
+    int nid = MyNode->AssignNid();
+    int programArgc = 0;
+    PROCESSTYPE processType = ProcessType_DTM;
+    CProcess* dtmProcess;
+    CClusterConfig* clusterConfig = Nodes->GetClusterConfig();
+    CPersistConfig* persistConfig = NULL;
+    
+    assert(clusterConfig != NULL);
+
+    persistConfig = clusterConfig->GetPersistConfig( "DTM" );
+    if (persistConfig == NULL)
+    {
+        char buf[MON_STRING_BUF_SIZE];
+        snprintf( buf, sizeof(buf)
+                , "[%s], Persistent process configuration for DTM is missing!\n"
+                , method_name );
+        mon_log_write(MON_NODE_STARTDTMPROCESS_1, SQ_LOG_ERR, buf);
+        abort();
+    }
+
+    // Empty the buffers the attribute lookup may append to or leave untouched.
+    programArgs[0] = persistZones[0] = '\0';
+    GetPersistProcessAttributes( persistConfig
+                               , nid
+                               , processType
+                               , processName
+                               , programName
+                               , programArgc
+                               , programArgs
+                               , outfile
+                               , persistRetries
+                               , persistZones );
+
+    const char *logpath = getenv("TRAF_LOG"); // NOTE(review): NULL if unset -- confirm it is always exported
+    snprintf( stdoutFile, sizeof(stdoutFile)
+            , "%s/%s"
+            , logpath, outfile );
+
+    if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL))
+    {
+        trace_printf( "%s@%d - Process %s, logpath=%s, outfile=%s, stdout=%s\n"
+                    , method_name, __LINE__
+                    , processName, logpath, outfile, stdoutFile);
+    }
+    // Build "<PATH>:<MyPath>" bounded by sizeof(path); an unset PATH is treated as empty.
+    const char *envPath = getenv("PATH");
+    snprintf( path, sizeof(path), "%s:%s", envPath ? envPath : "", MyPath );
+    ldpath = getenv("LD_LIBRARY_PATH");
+    if (ldpath == NULL) ldpath = (char *) ""; // never hand GetStringId a NULL candidate
+    strId_t pathStrId = MyNode->GetStringId ( path );
+    strId_t ldpathStrId = MyNode->GetStringId ( ldpath );
+    strId_t programStrId = MyNode->GetStringId ( programName );
+
+    int result;
+    dtmProcess  = CreateProcess( NULL               // parent
+                               , nid
+                               , ProcessType_DTM
+                               , 0                  // debug
+                               , 0                  // priority
+                               , 0                  // backup
+                               , true               // unhooked
+                               , processName
+                               , pathStrId 
+                               , ldpathStrId 
+                               , programStrId 
+                               , (char *) ""        // infile
+                               , stdoutFile         // outfile
+                               , 0                  // tag
+                               , result );
+    if ( dtmProcess )
+    {
+        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+           trace_printf( "%s@%d - DTM process created (%s)\n"
+                       , method_name, __LINE__, processName );
+    }
+    else
+    {
+        char buf[MON_STRING_BUF_SIZE];
+        snprintf( buf, sizeof(buf)
+               , "[%s], DTM process creation failed! (%s)\n"
+               , method_name, processName );
+        mon_log_write( MON_NODE_STARTDTMPROCESS_2, SQ_LOG_ERR, buf );
+    }
+
+    TRACE_EXIT;
+}
+
 void CNode::StartNameServerProcess( void )
 {
     const char method_name[] = "CNode::StartNameServerProcess";
@@ -1335,7 +1580,7 @@
 
     if (!(WDT_KeepAliveTimerValueC = getenv("SQ_WDT_KEEPALIVETIMERVALUE")))
     {
-        wdtKeepAliveTimerValue_ = WDT_KeepAliveTimerDefault;
+        wdtKeepAliveTimerValue_ = WDT_KEEPALIVETIMERDEFAULT;
     }
     else
     {
@@ -1489,7 +1734,7 @@
                 // Send local PSD process event to start persistent processes 
                 // that don't require transactions
                 process = lnode->GetProcessLByType( ProcessType_PSD );
-                if ( process && process->IsFirstInstance() )
+                if ( process )
                 {
                     char nidString[6];
                     sprintf(nidString,"%d",lnode->GetNid());
@@ -1720,6 +1965,32 @@
 }
 
 
+int CNodeContainer::GetPNodesUpCount( int &readyCount )
+{
+    const char method_name[] = "CNodeContainer::GetPNodesUpCount";
+    TRACE_ENTRY;
+
+    // Walks the physical node list once, counting the nodes in State_Up and,
+    // among those, the ones whose phase is Phase_Ready.
+    int nodesUp = 0;
+    int nodesReady = 0;
+
+    for (CNode *pnode = head_; pnode != NULL; pnode = pnode->GetNext())
+    {
+        if ( pnode->GetState() != State_Up )
+        {
+            continue;   // nodes that are not Up count toward neither total
+        }
+        ++nodesUp;
+        nodesReady += (pnode->GetPhase() == Phase_Ready) ? 1 : 0;
+    }
+
+    readyCount = nodesReady;   // out: Up nodes that are also Phase_Ready
+
+    TRACE_EXIT;
+    return( nodesUp );
+}
+
 int CNodeContainer::GetPNid( char *nodeName )
 {
     const char method_name[] = "CNodeContainer::GetPNid";
@@ -1796,6 +2067,26 @@
     TRACE_EXIT;
     return(process);
 }
+
+void CNodeContainer::AddConfiguredZNodes( void )
+{
+    const char method_name[] = "CNodeContainer::AddConfiguredZNodes";
+    TRACE_ENTRY;
+
+    // When ZClient is enabled and this monitor is either not in agent mode
+    // or is the master, call ZClient->ConfiguredZNodeCreate() with the name
+    // of every physical node in the cluster configuration.
+    const bool createsZNodes = ZClientEnabled && (!IsAgentMode || IsMaster);
+    CPNodeConfig *cfg = createsZNodes ? clusterConfig_->GetFirstPNodeConfig()
+                                      : NULL;
+    while (cfg != NULL)
+    {
+        ZClient->ConfiguredZNodeCreate( cfg->GetName() );
+        cfg = cfg->GetNext();
+    }
+
+    TRACE_EXIT;
+}
 #endif
 
 CNode *CNodeContainer::AddNode( int pnid )
@@ -1811,7 +2102,10 @@
         if (!node)
         {
             node = new CNode( (char *)pnodeConfig->GetName()
-                            , pnodeConfig->GetPNid(), -1 );
+                            , (char *)pnodeConfig->GetDomain()
+                            , (char *)pnodeConfig->GetFqdn()
+                            , pnodeConfig->GetPNid()
+                            , -1 );
             assert( node != NULL );
     
             if ( node )
@@ -1821,6 +2115,13 @@
 
                 // Broadcast node added notice to local processes
                 AddedNode( node );
+
+                char buf[MON_STRING_BUF_SIZE];
+                snprintf( buf, sizeof(buf)
+                        , "[%s@%d] Node %s added to configuration, pnid=%d\n"
+                        , method_name, __LINE__
+                        , node->GetName(), node->GetPNid() );
+                mon_log_write(MON_NODE_ADDNODE_4, SQ_LOG_INFO, buf);
             }
             else
             {
@@ -1887,7 +2188,33 @@
         }
         else
         {
-            tail_ = tail_->Link(node);
+            // add to list in pnid sort order
+            if (node->GetPNid() < head_->GetPNid())
+            { // link new node to the beginning
+                head_->LinkBefore( head_, node );
+            }
+            else if (node->GetPNid() > tail_->GetPNid())
+            { // link new node to the end
+                tail_->LinkAfter( tail_, node );
+            }
+            else
+            {
+                CNode *entry = head_;
+                CNode *prevEntry = NULL;
+                while (entry)
+                { // walk the list
+                    if (node->GetPNid() > entry->GetPNid())
+                    { // new node is greater than current list entry
+                        prevEntry = entry;
+                        entry = prevEntry->GetNext();
+                    }
+                    else
+                    { // new node is less than current list entry
+                        prevEntry->LinkAfter( tail_, node );
+                        entry = NULL;
+                    }
+                }
+            }
         }
 #ifdef NAMESERVER_PROCESS
         AddLNodes( node );
@@ -1953,6 +2280,8 @@
             // add the spare node's pnid to the spare set
             sparePNids[pnodeConfig->GetSparesCount()] = pnid;
             node = new CNode( (char *)pnodeConfig->GetName()
+                            , (char *)pnodeConfig->GetDomain()
+                            , (char *)pnodeConfig->GetFqdn()
                             , pnid
                             , rank 
                             , pnodeConfig->GetSparesCount()+1
@@ -1974,7 +2303,11 @@
             {
                 rank = -1; // -1 creates node in down state
             }
-            node = new CNode( (char *)pnodeConfig->GetName(), pnid, rank );
+            node = new CNode( (char *)pnodeConfig->GetName()
+                            , (char *)pnodeConfig->GetDomain()
+                            , (char *)pnodeConfig->GetFqdn()
+                            , pnid
+                            , rank );
             assert( node != NULL );
         }
         
@@ -2018,6 +2351,32 @@
     TRACE_EXIT;
 }
 
+void CNodeContainer::AddLNodes( )
+{
+    const char method_name[] = "CNodeContainer::AddLNodes";
+    TRACE_ENTRY;
+
+    // For every physical node in the cluster configuration, look up its
+    // CNode in Node[] by pnid and, unless IAmIntegrating is set, add its
+    // logical nodes through AddLNodes( CNode * ).
+    CPNodeConfig *cfg = clusterConfig_->GetFirstPNodeConfig();
+    while (cfg != NULL)
+    {
+        CNode *pnode = Node[cfg->GetPNid()];
+        assert( pnode != NULL );
+
+        // now add logical nodes to physical node
+        if (!IAmIntegrating)
+        {
+             AddLNodes( pnode );
+        }
+
+        cfg = cfg->GetNext();
+    }
+
+    TRACE_EXIT;
+}
+
 void CNodeContainer::AddLNodes( CNode  *node )
 {
     const char method_name[] = "CNodeContainer::AddLNodes";
@@ -2104,7 +2463,8 @@
                       "configuration of physical node, pnid=%d\n"
                     , method_name, __LINE__, node2->GetPNid() );
             mon_log_write(MON_NODE_ADDLNODES_3, SQ_LOG_ERR, buf);
-            abort();
+
+            mon_failure_exit();
         }
     }
     else
@@ -2115,7 +2475,8 @@
                   "pnid=%d\n"
                 , method_name, __LINE__, node2->GetPNid());
         mon_log_write(MON_NODE_ADDLNODES_4, SQ_LOG_ERR, buf);
-        abort();
+
+        mon_failure_exit();
     }
 
     TRACE_EXIT;
@@ -2750,17 +3111,27 @@
     TRACE_ENTRY;
 
     int rs = true;
+    string nodeName;
 
     CNode *pnode = GetNode( pnid );
     if ( pnode )
     {
+        nodeName = pnode->GetName();
         // Broadcast node deleted notice to local processes
         Nodes->DeletedNode( pnode );
 
         // Now delete it from the monitor's view
         Nodes->DeleteNode( pnode );
+
         // Verify it was deleted, sanity check!
-        assert( (pnode = Nodes->GetNode( pnid )) == NULL );
+        if ((pnode = Nodes->GetNode( pnid )) == NULL )
+        {
+            char buf[MON_STRING_BUF_SIZE];
+            snprintf( buf, sizeof(buf)
+                    , "[%s@%d] Node %s deleted from configuration, pnid=%d\n"
+                    , method_name, __LINE__, nodeName.c_str() , pnid);
+            mon_log_write(MON_NODE_DELETENODE_2, SQ_LOG_INFO, buf);
+        }
     }
     else
     {
@@ -3507,68 +3878,6 @@
     TRACE_EXIT;
     return( process );
 }
-
-SyncState CNodeContainer::GetTmState ( SyncState check_state )
-{
-    SyncState state = check_state;
-    CNode *node = head_;
-    const char method_name[] = "CNodeContainer::GetTmState";
-    TRACE_ENTRY;
-    
-    while (node)
-    {
-        if ( node->GetState() == State_Up && ! node->IsSpareNode() && node->GetPhase() == Phase_Ready)
-        {
-            if ( check_state == SyncState_Start )
-            {
-                if ( node->GetPNid() == MyPNID )
-                {
-                    if ( node->GetTmSyncState() != SyncState_Start )
-                    {
-                        state = SyncState_Abort;
-                        if (trace_settings & TRACE_TMSYNC)
-                           trace_printf("%s@%d" " - Node %s, pnid=%d" " no longer in Master Sync Start state" "\n", method_name, __LINE__, node->GetName(), node->GetPNid());
-                        break;
-                    }
-                }
-                else
-                {
-                    if ( node->GetTmSyncState() != SyncState_Continue )
-                    {
-                        state = SyncState_Abort;
-                        if (trace_settings & TRACE_TMSYNC)
-                           trace_printf("%s@%d" " - Node %s, pnid=%d" " doesn't agree on Sync Start state, returned state=" "%d" "\n", method_name, __LINE__, node->GetName(), node->GetPNid(), node->GetTmSyncState());
-                        break;
-                    }
-                }
-            }
-            else
-            {
-                if ( check_state == SyncState_Suspended )
-                {
-                    state = node->GetTmSyncState();
-                    if ( state == SyncState_Suspended )
-                    {
-                        if (trace_settings & TRACE_TMSYNC)
-                           trace_printf("%s@%d" " - Node %s, pnid=%d" " is in TmSync Suspended state\n", method_name, __LINE__, node->GetName(), node->GetPNid());
-                        break;
-                    }
-                }
-                else if ( node->GetTmSyncState() != check_state )
-                {
-                    state = node->GetTmSyncState();
-                    if (trace_settings & TRACE_TMSYNC)
-                       trace_printf("%s@%d" " - Node %s, pnid=%d" " doesn't agree on TmState, returned state=" "%d" "\n", method_name, __LINE__, node->GetName(), node->GetPNid(), state);
-                    break;
-                }
-            }
-        }
-        node = node->GetNext ();
-    }
-    
-    TRACE_EXIT;
-    return state;
-}
 #endif
 
 CNode *CNodeContainer::GetZoneNode(int zid)
@@ -3591,6 +3900,34 @@
     return node;
 }
 
+void CNodeContainer::InitRecvBuffer( struct sync_buffer_def *recvBuf )
+{
+    const char method_name[] = "CNodeContainer::InitRecvBuffer";
+    TRACE_ENTRY;
+
+    // Resets one recvBuf slot per physical node; the slot position is the
+    // value indexToPnid_[] holds for each index below GetPNodesCount().
+    for (int idx = 0; idx < GetPNodesCount(); ++idx)
+    {
+        struct sync_buffer_def &slot = recvBuf[indexToPnid_[idx]];
+
+        slot.nodeInfo.node_state    = State_Unknown;
+        slot.nodeInfo.sdLevel       = ShutdownLevel_Undefined;
+        slot.nodeInfo.tmSyncState   = SyncState_Null;
+        slot.nodeInfo.internalState = State_Default;
+        slot.nodeInfo.change_nid    = -1;
+        slot.nodeInfo.seq_num       = 0;
+        slot.msgInfo.msg_count      = 0;
+        slot.msgInfo.msg_offset     = 0;
+
+        // The first message in the slot gets the InternalType_Null type.
+        struct internal_msg_def *first = (struct internal_msg_def *) &slot.msg[0];
+        first->type = InternalType_Null;
+    }
+
+    TRACE_EXIT;
+}
+
 struct internal_msg_def *
 CNodeContainer::InitSyncBuffer( struct sync_buffer_def *syncBuf
                               , unsigned long long seqNum
@@ -3603,9 +3940,6 @@
 
     syncBuf->nodeInfo.node_state    = MyNode->GetState();
     syncBuf->nodeInfo.sdLevel       = MyNode->GetShutdownLevel();
-#ifndef NAMESERVER_PROCESS
-    syncBuf->nodeInfo.tmSyncState   = MyNode->GetTmSyncState();
-#endif
     syncBuf->nodeInfo.internalState = MyNode->getInternalState();
     syncBuf->nodeInfo.change_nid    = -1;
     syncBuf->nodeInfo.seq_num       = seqNum;
@@ -3664,9 +3998,47 @@
     syncBufferFreeSpace_ = ( MAX_SYNC_SIZE - 
                              (sizeof(cluster_state_def_t) + sizeof(msgInfo_t)));
 
+    TRACE_EXIT;
     return msg;
 }
 
+bool CNodeContainer::IsMyNodeFirstInConfigUp( void )
+{
+    const char method_name[] = "CNodeContainer::IsMyNodeFirstInConfigUp";
+    TRACE_ENTRY;
+
+    // Returns true when the first physical node, in cluster configuration
+    // order, that exists and is in State_Up has pnid == MyPNID; returns
+    // false when another node comes first or no node is Up.
+    bool firstUp = false;
+    int pnid;
+    CNode *node = NULL;
+
+    CPNodeConfig *pnodeConfig = clusterConfig_->GetFirstPNodeConfig();
+    for ( ; pnodeConfig; pnodeConfig = pnodeConfig->GetNext() )
+    {
+        pnid = pnodeConfig->GetPNid();
+        node = GetNode( pnid );
+        if (node && node->GetState() == State_Up )
+        {
+            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+            {
+                trace_printf("%s@%d" " - MyPNID=%d, config pnid=%d\n"
+                            , method_name, __LINE__
+                            , MyPNID, pnid );
+            }
+
+            // Only the first Up node decides the answer, so stop either way.
+            // A single exit path keeps TRACE_EXIT paired with TRACE_ENTRY.
+            firstUp = (pnid == MyPNID);
+            break;
+        }
+    }
+
+    TRACE_EXIT;
+    return( firstUp );
+}
+
 bool CNodeContainer::IsShutdownActive (void)
 {
     bool status = false;
@@ -3925,7 +4297,7 @@
             sprintf(la_buf, "[%s], Failed to load nameserver configuration.\n", method_name);
             mon_log_write(MON_NODECONT_LOAD_CONFIG_4, SQ_LOG_CRIT, la_buf);
             
-            abort();
+            mon_failure_exit();
         }
     }
 
diff --git a/core/sqf/monitor/linux/pnode.h b/core/sqf/monitor/linux/pnode.h
index b04e387..402bf40 100644
--- a/core/sqf/monitor/linux/pnode.h
+++ b/core/sqf/monitor/linux/pnode.h
@@ -43,10 +43,9 @@
 class CNode;
 
 typedef enum {
-    Phase_Ready=0,                          // Node ready for use
-    Phase_Activating,                       // Spare node going active
-    Phase_SoftDown,                         // Node soft down
-    Phase_SoftUp                            // Node soft up
+    Phase_Undefined=0                      // Node phase not yet defined
+   ,Phase_Ready                            // Node ready for use
+   ,Phase_Activating                       // Spare node going active
 } NodePhase;
 typedef vector<int>     PNidVector;
 typedef list<CNode *>   NodesList;
@@ -68,9 +67,11 @@
     void    AddedNode( CNode *node );
 #ifndef NAMESERVER_PROCESS
     CProcess *AddCloneProcess( ProcessInfoNs_reply_def *processInfo );
+    void    AddConfiguredZNodes( void );
 #endif
     CNode  *AddNode( int pnid );
     void    AddNodes( void );
+    void    AddLNodes( void );
     void    AddToSpareNodesList( int pnid );
     CLNode *AssignLNode( CProcess *requester, PROCESSTYPE type, int nid, int not_nid );
     void    CancelDeathNotification( int nid
@@ -103,10 +104,12 @@
     inline int GetLNodesCount( void ) { return ( CLNodeContainer::GetLNodesCount() ); }
     inline int GetPNodesConfigMax( void ) { return ( clusterConfig_->GetPNodesConfigMax() ); }
     inline int GetPNodesCount( void ) { return ( pnodeCount_ ); }
+    int        GetPNodesUpCount( int &readyCount );
     inline int GetSNodesCount( void ) { return ( clusterConfig_->GetSNodesCount() ); }
     inline int GetAvailableSNodesCount( void ) { return ( spareNodesList_.size() ); }
 
     int     GetPNid( char *nodeName );
+    inline int GetPNidByMap( int index ){ return ( indexToPnid_[index] ); }
     CProcess *GetProcess( int nid, int pid, bool checknode=true );
     CProcess *GetProcess( int nid
                         , int pid
@@ -132,17 +135,19 @@
     SyncState GetTmState( SyncState check_state );
     CNode  *GetZoneNode( int zid );
 
+    void    InitRecvBuffer( struct sync_buffer_def *recvcBuf );
     struct internal_msg_def *InitSyncBuffer( struct sync_buffer_def *syncBuf
                                            , unsigned long long seqNum
                                            , upNodes_t upNodes );
-    int GetSyncSize() { return  sizeof(cluster_state_def_t)
+    int GetSyncSize(sync_buffer_def *sync_buffer) { return  sizeof(cluster_state_def_t)
                               + sizeof(msgInfo_t)
-                              + SyncBuffer->msgInfo.msg_offset; };
+                              + sync_buffer->msgInfo.msg_offset; };
     inline int GetSyncHdrSize() { return  sizeof(cluster_state_def_t)
                                           + sizeof(msgInfo_t); };
     struct sync_buffer_def * GetLastSyncBuffer() { return lastSyncBuffer_; };
     struct sync_buffer_def * GetSyncBuffer() { return SyncBuffer; };
     bool    IsShutdownActive( void );
+    bool    IsMyNodeFirstInConfigUp( void );
     void    KillAll( CProcess *process );
     void    LoadConfig( void );
     void    MarkStaleOpens( int nid, int pid );
@@ -195,8 +200,14 @@
     int            eyecatcher_;      // Debuggging aid -- leave as first
                                      // member variable of the class
 public:
-    CNode( char *name, int pnid, int rank );
     CNode( char *name
+         , char *domain
+         , char *fqdn
+         , int pnid
+         , int rank );
+    CNode( char *name
+         , char *domain
+         , char *fqdn
          , int   pnid
          , int   rank
          , int   sparePNidCount
@@ -243,6 +254,8 @@
     inline unsigned int GetBTime( void ) { return( bTime_ ); }
     inline CLNodeContainer *GetLNodeContainer( void ) { return( dynamic_cast<CLNodeContainer*>(this) ); }
     inline const char *GetHostname( void ) { return( hostname_.c_str() ); }
+    inline const char *GetDomain( void ) { return( domain_ ); }
+    inline const char *GetFqdn( void ) { return( fqdn_ ); }
     inline const char *GetName( void ) { return( name_ ); }
     inline int   GetPNid( void ) { return( pnid_ ); }
     inline NodePhase     GetPhase( void ) { return( phase_ ); }
@@ -271,8 +284,6 @@
     // the unique id as the value of the method.
     strId_t GetStringId( char *candidate, CLNode *targetLNode = NULL, bool clone = false );
 
-    inline int   GetTmSyncNid( void ) { return( tmSyncNid_ ); }
-    inline SyncState GetTmSyncState( void ) { return( tmSyncState_ ); }
     inline int   GetZone( void ) { return( zid_ ); }
     inline int   GetWDTKeepAliveTimerValue( void ) { return( wdtKeepAliveTimerValue_ ); }
     inline bool  IsActivatingSpare( void ) { return( activatingSpare_ ); }
@@ -280,18 +291,21 @@
     inline bool  IsDTMAborted( void ) { return( dtmAborted_ ); }
     inline bool  IsSMSAborted( void ) { return( smsAborted_ ); }
     inline bool  IsKillingNode( void ) { return( killingNode_ ); }
+    inline bool  IsPendingNodeDown( void ) { return( pendingNodeDown_ ); }
+    inline bool  IsPrimitivesReady( void ) { return( (primitiveDtmUp_ && primitivePsdUp_ && primitiveWdgUp_) ); }
     inline bool  IsRankFailure( void ) { return( rankFailure_ ); }
     inline bool  IsSpareNode( void ) { return( spareNode_ ); }
-    inline bool  IsSoftNodeDown( void ) { return( internalState_ == State_SoftDown ); }
-    inline bool  IsSoftNodeUp( void ) { return( internalState_ == State_SoftUp ); }
     inline bool  IsShutdownNameServer( void ) { return( shutdownNameServer_ ); }
 
-    CNode  *Link( CNode *entry );
+    CNode  *LinkAfter( CNode * &tail, CNode * entry );
+    CNode  *LinkBefore( CNode * &head, CNode * entry );
     void    MoveLNodes( CNode *targetNode );
     inline void ResetSpareNode( void ) { spareNode_ = false; }
-    void    ResetWatchdogTimer( void );
-    inline void ResetSoftNodeDown( void ) { internalState_ = State_Default; }
-    inline void ResetSoftNodeUp( void ) { internalState_ = State_Default; }
+    void        ResetWatchdogTimer( void );
+    inline void ResetPrimitiveDtmUp( void ) { primitiveDtmUp_ = false; }
+    inline void ResetPrimitivePsdUp( void ) { primitivePsdUp_ = false; }
+    inline void ResetPrimitiveWdgUp( void ) { primitiveWdgUp_ = false; }
+
     inline void SetActivatingSpare( int activatingSpare ) { activatingSpare_ = activatingSpare; }
     void    SetAffinity( int nid, pid_t pid, PROCESSTYPE type );
     void    SetAffinity( CProcess *process );
@@ -320,9 +334,8 @@
     inline void SetSMSAborted( bool smsAborted ) { smsAborted_ = smsAborted; }
     inline void SetKillingNode( bool killingNode ) { killingNode_ = killingNode; }
     inline void SetNumCores( int numCores ) { numCores_ = numCores; }
+    inline void SetPendingNodeDown(  bool pendingNodeDown ) { pendingNodeDown_ = pendingNodeDown; }
     inline void SetPhase( NodePhase phase ) { phase_ = phase; }
-    inline void SetSoftNodeDown( void ) { internalState_ = State_SoftDown; }
-    inline void SetSoftNodeUp( void ) { internalState_ = State_SoftUp; }
     inline void SetSparePNids( PNidVector &sparePNids ) { sparePNids_ = sparePNids; }
     inline void SetRank( int rank ) { rank_ = rank; }
     inline void SetRankFailure( bool failed ) { rankFailure_ = failed; 
@@ -344,10 +357,13 @@
     inline void SetShutdownNameServer( bool shutdown ) { shutdownNameServer_ = shutdown; }
     inline void SetShutdownLevel( ShutdownLevel level ) { shutdownLevel_ = level; }
     void SetState( STATE state );
-    inline void SetTmSyncNid( int nid ) { tmSyncNid_ = nid; }
-    inline void SetTmSyncState( SyncState syncState ) { tmSyncState_ = syncState; }
     inline void SetZone( int zid ) { zid_ = zid; }
     inline void SetName( char *newName ) { if (newName) strcpy (name_, newName); }
+    inline void SetPrimitiveDtmUp( void ) { primitiveDtmUp_ = true; }
+    inline void SetPrimitivePsdUp( void ) { primitivePsdUp_ = true; }
+    inline void SetPrimitiveWdgUp( void ) { primitiveWdgUp_ = true; }
+
+    void StartDtmProcess( void );
     void StartPStartDProcess( void );
     void StartPStartDPersistent( void );
     void StartPStartDPersistentDTM( int nid );
@@ -393,13 +409,20 @@
     unsigned int  freeCache_;    // amount of free buffer/cache in node
     unsigned int  memInfoData_[memFinalItem];
     unsigned int  bTime_;        // node boot time
-    char          name_[MPI_MAX_PROCESSOR_NAME]; // physical node name
+    char          domain_[MPI_MAX_PROCESSOR_NAME]; // domain name
+    char          fqdn_[MPI_MAX_PROCESSOR_NAME]; // Fully Qualified Domain Name (FQDN)
+    char          name_[MPI_MAX_PROCESSOR_NAME]; // short node name
     string        hostname_;     // physical node name without domain
     STATE         state_;        // Physical node's current operating state
     NodePhase     phase_;        // Physical node's current phase during spare node activation
     bool          killingNode_;  // true when down node in process
     bool          dtmAborted_;   // true when DTM process terminates abnormally
     bool          smsAborted_;   // true when SMS process terminates abnormally
+    bool          pendingNodeDown_;  // true when node down processing is pending
+
+    bool          primitiveDtmUp_; // DTM running and startup sent
+    bool          primitivePsdUp_; // PStartD (PSD) running and startup sent
+    bool          primitiveWdgUp_; // Watchdog (WDG) running and startup sent
 
     CLNode       *lastLNode_;    // last logical node selected for process attach
     ShutdownLevel lastSdLevel_;  // last shutdown level
@@ -416,8 +439,6 @@
     CNode        *next_;
     CNode        *prev_;
     int           rank_;         // Node's Monitor rank in COMM_WORLD
-    int           tmSyncNid_;    // Logical Node of TM that initiated sync
-    SyncState     tmSyncState_;  // Sync operation state with TMs
     ShutdownLevel shutdownLevel_;
     bool          shutdownNameServer_; // true when monitor shutdown Name Server request is received
     int           wdtKeepAliveTimerValue_; // expiration time
@@ -455,6 +476,16 @@
         size_t remBytes;
     } bufInfo_t;
 
+    void    GetPersistProcessAttributes( CPersistConfig *persistConfig
+                                       , int             nid
+                                       , PROCESSTYPE    &processType
+                                       , char           *processName
+                                       , char           *programName
+                                       , int            &programArgc
+                                       , char           *programArgs
+                                       , char           *outfile
+                                       , char           *persistRetries
+                                       , char           *persistZones );
     bool NextMemInfoLine( bufInfo_t &inBuf, char * dataline );
 
     timespec      prevSchedData_;  // timestamp for when last acquired
diff --git a/core/sqf/monitor/linux/process.cxx b/core/sqf/monitor/linux/process.cxx
index f73b02e..e2ebc58 100644
--- a/core/sqf/monitor/linux/process.cxx
+++ b/core/sqf/monitor/linux/process.cxx
@@ -79,11 +79,14 @@
 
 extern bool IsAgentMode;
 extern bool IsMaster;
+extern bool IsRealCluster;
 
 extern bool PidMap;
 extern int Measure;
 extern int trace_level;
 extern int MyPNID;
+extern int ClusterId ;
+extern int InstanceId;
 extern char MyCommPort[MPI_MAX_PORT_NAME];
 extern char Node_name[MPI_MAX_PROCESSOR_NAME];
 extern sigset_t SigSet;
@@ -120,6 +123,7 @@
 
 extern const char *NodePhaseString( NodePhase phase );
 extern const char *ProcessTypeString( PROCESSTYPE type );
+extern const char *StateString( STATE state);
 
 extern int monitorArgc;
 extern char monitorArgv[MAX_ARGS][MAX_ARG_SIZE];
@@ -184,7 +188,6 @@
     prev_(NULL),
     nextL_(NULL),
     prevL_(NULL),
-    unsolTmSyncCount_(0),
     Last_error (MPI_SUCCESS)
     , argc_(0)
     , userArgvLen_ (0)
@@ -203,6 +206,7 @@
 #endif
     , firstInstance_(true)
     , cmpOrEsp_(false)
+    , trafRootZnode_()
     , trafConf_()
     , trafHome_()
     , trafLog_()
@@ -243,6 +247,19 @@
     if ( outfile && strcmp(outfile,"#default") != 0)
         outfile_ = outfile;
 
+    if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL))
+    {
+        trace_printf( "%s@%d - Process %s, infile=%s, infile_=%s\n"
+                    , method_name, __LINE__
+                    , Name, infile, infile_.c_str());
+    }
+    if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL))
+    {
+        trace_printf( "%s@%d - Process %s, outfile=%s, outfile_=%s\n"
+                    , method_name, __LINE__
+                    , Name, outfile, outfile_.c_str());
+    }
+
 #ifndef NAMESERVER_PROCESS
     Config->strIdToString(programStrId_, program_ );
 #endif
@@ -334,7 +351,22 @@
         }
     }
     if (trace_settings & (TRACE_PROCESS_DETAIL | TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL))
-       trace_printf("%s@%d" " - Process " "%s (nid=%d, priority=%d)" " created @ " "%p""\n", method_name, __LINE__, Name, Nid, Priority, this);
+    {
+        trace_printf( "%s@%d - Process %s created @ %p:\n"
+                      "                          nid       =%d\n"
+                      "                          priority  =%d\n"
+                      "                          type      =%s\n"
+                      "                          persistent=%d\n"
+                      "                          unhooked  =%d\n"
+                    , method_name, __LINE__
+                    , Name
+                    , this
+                    , Nid
+                    , Priority 
+                    , ProcessTypeString(Type)
+                    , Persistent
+                    , UnHooked );
+    }
 
     Monitor->IncProcessCount();
 
@@ -923,7 +955,11 @@
                 msg->u.reply.u.dump.verifier = Verifier;
                 if (status == Dump_Success)
                 {
-                    STRCPY(msg->u.reply.u.dump.core_file, core_file);
+                    char coreFile[MAX_PROCESS_PATH];
+                    CNode * node = Nodes->GetLNode(GetNid())->GetNode();
+                    snprintf( coreFile, sizeof(coreFile)
+                            , "%s:%s", node->GetFqdn(), core_file );
+                    STRCPY(msg->u.reply.u.dump.core_file, coreFile);
                     msg->u.reply.u.dump.return_code = MPI_SUCCESS;
                 }
                 else
@@ -1161,27 +1197,29 @@
             // let healthcheck thread know that the SMService process is up and running.
             HealthCheck.setState(HC_UPDATE_SMSERVICE, (long long)this);
         }
-        if ( Type == ProcessType_Watchdog )
+        else if ( Type == ProcessType_DTM )
         {
+            MyNode->SetPrimitiveDtmUp();
+        }
+        else if ( Type == ProcessType_Watchdog )
+        {
+            MyNode->SetPrimitiveWdgUp();
             // let healthcheck thread know that the watchdog process is up and running.
             HealthCheck.setState(HC_UPDATE_WATCHDOG, (long long)this);
             // start the watchdog timer
             HealthCheck.setState(MON_START_WATCHDOG);
         }
-        if ( Type == ProcessType_PSD &&
-            (IAmIntegrated || MyNode->IsActivatingSpare() || MyNode->IsSoftNodeDown()) )
+        else if ( Type == ProcessType_PSD )
         {
-             MyNode->StartPStartDPersistent();
+            MyNode->SetPrimitivePsdUp();
+            if(IsRealCluster)
+            {
+                MyNode->StartPStartDPersistent();
+            }
 
              if (trace_settings & (TRACE_RECOVERY | TRACE_REQUEST | TRACE_INIT))
                  trace_printf("%s%d: Sent start persistent processes event to PSD process %s (pid=%d)\n", method_name, __LINE__, GetName(), GetPid());
         }
-        if ( Type == ProcessType_DTM  &&
-             MyNode->IsSoftNodeDown() )
-        {
-            // Tell remote DTMs that this DTM was restarted
-            Monitor->SoftNodeUpPrepare( MyPNID );
-        }
     }
 
     TRACE_EXIT;
@@ -1241,6 +1279,12 @@
         if (!outfile_.empty())
         {
             STRCPY(Destfile, outfile_.c_str());
+            if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL))
+            {
+                trace_printf( "%s@%d - Process %s, Destfile=%s, outfile_=%s\n"
+                            , method_name, __LINE__
+                            , Name, Destfile, outfile_.c_str());
+            }
             TRACE_EXIT;
             return true;
         }
@@ -1250,6 +1294,12 @@
         if (!infile_.empty())
         {
             STRCPY(Destfile, infile_.c_str());
+            if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL))
+            {   // Trace the file name just copied from infile_ into Destfile
+                trace_printf( "%s@%d - Process %s, Destfile=%s, infile_=%s\n"
+                            , method_name, __LINE__
+                            , Name, Destfile, infile_.c_str());
+            }
             TRACE_EXIT;
             return true;
         }
@@ -1917,6 +1967,12 @@
     if (env && isdigit(*env))
        numProcessThreads = atoi(env);
 
+    env = getenv( "TRAF_ROOT_ZNODE" );
+    if (env)
+    {
+        trafRootZnode_ = env ;
+    }
+
     env = getenv( "TRAF_CONF" );
     if (env)
     {
@@ -2003,6 +2059,10 @@
         setEnvStrVal ( childEnv, nextEnv, "MPI_INSTR", filename );
     }
 
+    setEnvIntVal ( childEnv, nextEnv, "TRAF_CLUSTER_ID", ClusterId );
+    setEnvIntVal ( childEnv, nextEnv, "TRAF_INSTANCE_ID", InstanceId );
+
+    setEnvStrVal ( childEnv, nextEnv, "TRAF_ROOT_ZNODE", trafRootZnode_.c_str() );
     setEnvStrVal ( childEnv, nextEnv, "TRAF_CONF", trafConf_.c_str() );
     setEnvStrVal ( childEnv, nextEnv, "TRAF_HOME", trafHome_.c_str() );
     setEnvStrVal ( childEnv, nextEnv, "TRAF_LOG", trafLog_.c_str() );
@@ -2109,38 +2169,17 @@
     }
 
     string LDpath;
-    static bool sv_getenv_ld_library_path_done = false;
-    static string sv_ld_library_path;
-    if (IsAgentMode)
-    {
-        if (! sv_getenv_ld_library_path_done)
-        {
-            sv_getenv_ld_library_path_done = true;
-            sv_ld_library_path = getenv( "LD_LIBRARY_PATH" );
-            if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
-            {
-                trace_printf( "%s@%d" " - LD_LIBRARY_PATH = " "%s" "\n", method_name, __LINE__, sv_ld_library_path.c_str() );
-            }
-        }
-        LDpath = sv_ld_library_path;
-        if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
-        {
-            trace_printf( "%s@%d" " - LD_LIBRARY_PATH = " "%s" "\n", method_name, __LINE__, LDpath.c_str() );
-        }
-    }
-    else
-    {
-        if (ldpathStrId_.nid != -1)
-        {
-            Config->strIdToString( ldpathStrId_, LDpath );
-        }
-    }
+    Config->strIdToString( ldpathStrId_, LDpath );
+
     if (!LDpath.empty())
     {
         setEnvStrVal( childEnv, nextEnv, "LD_LIBRARY_PATH", LDpath.c_str( ) );
         if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
         {
             trace_printf( "%s@%d - LD_LIBRARY_PATH = %s\n", method_name, __LINE__, LDpath.c_str() );
+            trace_printf( "%s@%d - ldpathStrId_ = stringId(nid=%d, id=%d)\n"
+                        , method_name, __LINE__
+                        , ldpathStrId_.nid, ldpathStrId_.id );
         }
     }
 
@@ -2148,6 +2187,17 @@
 
     string program;
     Config->strIdToString ( programStrId_, program );
+    if (!program.empty( ))
+    {
+        if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
+        {
+            trace_printf( "%s@%d - Program = %s\n", method_name, __LINE__, program.c_str( ) );
+            trace_printf( "%s@%d - programStrId_ = stringId(nid=%d, id=%d)\n"
+                        , method_name, __LINE__
+                        , programStrId_.nid, programStrId_.id );
+        }
+    }
+   
     // temp for performance investigation
     if ( strstr(program.c_str(), "tdm_arkcmp") != NULL
       || strstr(program.c_str(), "tdm_arkesp") != NULL )
@@ -2181,36 +2231,15 @@
     }
 
     string path;
-    static bool sv_getenv_path_done = false;
-    static string sv_path;
-    if (IsAgentMode)
-    {
-        if (! sv_getenv_path_done)
-        {
-            sv_getenv_path_done = true;
-            sv_path = getenv( "PATH" );
-            if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
-            {
-                trace_printf( "%s@%d" " - PATH = " "%s" "\n", method_name, __LINE__, sv_path.c_str() );
-            }
-        }
-        path = sv_path;
-        if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
-        {
-            trace_printf( "%s@%d" " - PATH = " "%s" "\n", method_name, __LINE__, path.c_str() );
-        }
-    }
-    else
-    {
-        if (pathStrId_.nid != -1)
-        {
-            Config->strIdToString( pathStrId_, path );
-        }
-    }
+    Config->strIdToString( pathStrId_, path );
+
     setEnvStrVal( childEnv, nextEnv, "PATH", path.c_str( ) );
     if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
     {
         trace_printf( "%s@%d" " - PATH = " "%s" "\n", method_name, __LINE__, path.c_str() );
+        trace_printf( "%s@%d - pathStrId_ = stringId(nid=%d, id=%d)\n"
+                    , method_name, __LINE__
+                    , pathStrId_.nid, pathStrId_.id );
     }
 
     // Set values from registry as environment variables
@@ -2295,9 +2324,9 @@
     argv[j + 2] = new char[6];
     sprintf (argv[j + 2], "%5.5d", Nid);
 
-    argv[j + 3] = new char[7];
+    argv[j + 3] = new char[10];
     //sprintf (argv[j + 3], "%6.6d", Pid);
-    strcpy(argv[j + 3],"??????"); // The Pid will be assigned later, but we can't print it then.
+    strcpy(argv[j + 3],"????????"); // The Pid will be assigned later, but we can't print it then.
 
     argv[j + 4] = new char[strlen(Name) ? strlen(Name)+1 : MAX_PROCESS_NAME_STR];
     strcpy (argv[j + 4], Name);
@@ -2514,7 +2543,7 @@
             // or handled by a specific process on another node.
             int AncestorNid = -1;
             int AncestorPid = -1;
-            char Stdfile[MAX_PROCESS_PATH];
+            char Stdfile[MAX_PROCESS_PATH] = {0};
             if (PickStdfile(PICK_STDIN, Stdfile, AncestorNid, AncestorPid))
             {
                 Redirector.stdinFd(Nid, os_pid, pfds_stdin[1], Stdfile,
@@ -2545,7 +2574,7 @@
             // or handled by a specific process on another node.
             int AncestorNid = -1;
             int AncestorPid = -1;
-            char Stdfile[MAX_PROCESS_PATH];
+            char Stdfile[MAX_PROCESS_PATH] = {0};
             if (!PickStdfile(PICK_STDOUT, Stdfile, AncestorNid, AncestorPid))
             {  // Unable to locate stdout file.  So create a file based
                // on the process name and use that for output.
@@ -2945,10 +2974,43 @@
             DumperVerifier = dumper->Verifier;
             status = SUCCESS;
             if (trace_settings & TRACE_PROCESS)
-                trace_printf("%s@%d - DumpState=Dump_Pending, pid=%d\n",
-                             method_name, __LINE__, Pid);
-            repl = new CReplDump(this);
-            Replicator.addItem(repl);
+                trace_printf("%s@%d - DumpState=Dump_Pending, "
+                             "target %s (%d, %d:%d), "
+                             "dumper %s (%d, %d:%d), core path:%s\n"
+                             , method_name, __LINE__
+                             , Name
+                             , Nid
+                             , Pid
+                             , Verifier
+                             , dumper->Name
+                             , dumper->Nid
+                             , dumper->Pid
+                             , dumper->Verifier
+                             , core_path?core_path:"" );
+            if ( NameServerEnabled )
+            {
+                int rc = -1;
+
+                rc = PtpClient->ProcessDump(this);
+                if (rc)
+                {
+                    char la_buf[MON_STRING_BUF_SIZE];
+                    snprintf( la_buf, sizeof(la_buf)
+                            , "[%s] - Dump process request failed: "
+                              "target %s (%d, %d:%d)\n"
+                            , method_name
+                            , Name
+                            , Nid
+                            , Pid
+                            , Verifier );
+                    mon_log_write(MON_PROCESS_DUMP_1, SQ_LOG_ERR, la_buf);
+                }
+            }
+            else
+            {
+                repl = new CReplDump(this);
+                Replicator.addItem(repl);
+            }
             break;
 
         default:
@@ -3017,32 +3079,88 @@
             cmd = (char *) program.c_str();
         else
             cmd++; // past '/'
+
+        // Override core_path if directory specified in /proc/sys/kernel/core_pattern
+        FILE * procCorePatternFile;  // "/proc/sys/kernel/core_pattern" file stream
+
+        procCorePatternFile = fopen("/proc/sys/kernel/core_pattern", "r");
+        if (!procCorePatternFile)
+        {
+            char buf[MON_STRING_BUF_SIZE];
+            sprintf(buf, "[%s], Cannot open /proc/sys/kernel/core_pattern, %s (%d)\n",
+                    method_name, strerror(errno), errno);
+            mon_log_write( MON_PROCESS_DUMP_BEGIN_1, SQ_LOG_ERR, buf );
+        }
+        else
+        {
+            char corePattern[132] = {0};
+            char corePath[132] = {0};
+
+            fgets( corePattern, 132, procCorePatternFile );
+            if (strlen( corePattern) )
+            {
+                strncpy( corePath, corePattern, sizeof(corePath) );
+                if (corePath[0] == '/')
+                {
+                    char *pch = strrchr( corePath, '/' );
+                    if (pch)
+                    {
+                        *pch = 0;
+                        strcpy( core_path, corePath );
+                    }
+                }
+                if (trace_settings & TRACE_PROCESS)
+                {
+                    trace_printf( "%s@%d - corePath=%s\n"
+                                , method_name, __LINE__, corePath );
+                    trace_printf( "%s@%d - core_path=%s\n"
+                                , method_name, __LINE__, core_path );
+                    trace_printf( "%s@%d - corePattern=%s\n"
+                                , method_name, __LINE__, corePattern );
+                }
+            }
+            else
+            {
+                if (trace_settings & TRACE_PROCESS)
+                {
+                    trace_printf("%s@%d - undefined corePattern=%s\n",
+                                 method_name, __LINE__, corePattern);
+                }
+            }
+            fclose(procCorePatternFile);
+        }
+
         // date=%Y-%m-%d_%H-%M-%S
         // core_file=<path>/core.<date>.<pname>.<pid>.<cmd>
-        snprintf(core_file, sizeof(core_file), "%s/core.%s.%s.%d.%s",
+        snprintf(core_file, sizeof(core_file), "%s/core.%s.%s.%d.%d.%s",
                 core_path,
                 date,
                 &Name[1],
+                Nid,
                 Pid,
                 cmd);
         corefile_ = core_file;
 
         if (trace_settings & TRACE_PROCESS)
-            trace_printf("%s@%d - starting mondump for pid=%d, core-file=%s\n",
-                         method_name, __LINE__, Pid, core_file);
+            trace_printf( "%s@%d - starting mondump - "
+                          "target %s (%d, %d:%d), "
+                          "dumper (%d, %d:%d), "
+                          "core-file=%s\n"
+                        , method_name, __LINE__
+                        , Name
+                        , Nid
+                        , Pid
+                        , Verifier
+                        , DumperNid
+                        , DumperPid
+                        , DumperVerifier
+                        , core_file);
 
         argv[0] = (char *) "mondump";
         snprintf(core_pid, sizeof(core_pid), "%d", Pid);
         argv[1] = core_pid;
         argv[2] = core_file;
-        if ((nid == Nid) || getenv("SQ_VIRTUAL_NODES"))
-           argv[3] = NULL;
-        else
-        {
-           argv[3] = (char *) Nodes->GetNode(Nid)->GetName();
-           argv[4] = getenv("MPI_TMPDIR");
-           argv[5] = NULL;
-        }
+        argv[3] = NULL;
         CLNode   *lnode = Nodes->GetLNode( Nid );
         err = IntProcess.create(argv[0],
                                 argv,
@@ -3058,8 +3176,34 @@
         else
         {
             DumpState = Dump_Complete;
-            CReplDumpComplete *repl = new CReplDumpComplete(this);
-            Replicator.addItem(repl);
+            if ( NameServerEnabled )
+            {
+                int rc = -1;
+    
+                rc = PtpClient->ProcessDumpComplete(this);
+                if (rc)
+                {
+                    char la_buf[MON_STRING_BUF_SIZE];
+                    snprintf( la_buf, sizeof(la_buf)
+                            , "[%s] - Dump complete reply to dumper failed: "
+                              "target %s (%d, %d:%d), "
+                              "dumper (%d, %d:%d)\n"
+                            , method_name
+                            , Name
+                            , Nid
+                            , Pid
+                            , Verifier
+                            , DumperNid
+                            , DumperPid
+                            , DumperVerifier );
+                    mon_log_write(MON_PROCESS_DUMP_BEGIN_2, SQ_LOG_ERR, la_buf);
+                }
+            }
+            else
+            {
+                CReplDumpComplete *repl = new CReplDumpComplete(this);
+                Replicator.addItem(repl);
+            }
             CompleteDump(Dump_Failed, NULL);
         }
     }
@@ -3067,11 +3211,29 @@
     if (trace_settings & TRACE_PROCESS)
     {
         if (DumpState == Dump_InProgress)
-            trace_printf("%s@%d - DumpState=Dump_InProgress, pid=%d\n",
-                         method_name, __LINE__, Pid);
+            trace_printf( "%s@%d - DumpState=Dump_InProgress, "
+                          "target %s (%d, %d:%d), "
+                          "dumper (%d, %d:%d)\n"
+                        , method_name, __LINE__
+                        , Name
+                        , Nid
+                        , Pid
+                        , Verifier
+                        , DumperNid
+                        , DumperPid
+                        , DumperVerifier );
         else
-            trace_printf("%s@%d - DumpState=Dump_Complete, pid=%d\n",
-                         method_name, __LINE__, Pid);
+            trace_printf( "%s@%d - DumpState=Dump_Complete, "
+                          "target %s (%d, %d:%d), "
+                          "dumper (%d, %d:%d)\n"
+                        , method_name, __LINE__
+                        , Name
+                        , Nid
+                        , Pid
+                        , Verifier
+                        , DumperNid
+                        , DumperPid
+                        , DumperVerifier );
     }
 
     TRACE_EXIT;
@@ -3244,9 +3406,7 @@
     {
         CNode * node = Nodes->GetLNode(GetNid())->GetNode();
         // if process' node is being killed, do not supply process death notices
-        supplyProcessDeathNotices = node->IsSoftNodeDown()
-                                        ? node->IsSoftNodeDown()
-                                        : !node->IsKillingNode();
+        supplyProcessDeathNotices = !node->IsKillingNode();
     }
 
     if(  NoticeHead &&
@@ -3417,12 +3577,14 @@
             case ProcessType_SPX:
             case ProcessType_PSD:
             case ProcessType_PERSIST:
+            case ProcessType_TMID:
                 // No special handling needed on exit
                 break;
             default:
-
-                snprintf(la_buf, sizeof(la_buf),
-                         "[CProcess::Exit], Invalid process type!\n");
+                snprintf(la_buf, sizeof(la_buf)
+                        , "[CProcess::Exit], Invalid process type(%d)! "
+                          "%s (%d,%d:%d)\n"
+                        , Type, Name , Nid, Pid, Verifier);
                 mon_log_write(MON_PROCESS_EXIT_1, SQ_LOG_ERR, la_buf);
         }
 
@@ -3482,44 +3644,11 @@
         }
 
         if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_PROCESS_DETAIL | TRACE_REQUEST_DETAIL))
-            trace_printf( "%s@%d" " - Death message check of %s (%d,%d:%d) type=%s, node phase=%s, send death notices=%d\n"
+            trace_printf( "%s@%d" " - Death message check of %s (%d,%d:%d) type=%s, My node phase=%s, send death notices=%d\n"
                         , method_name, __LINE__
                         , GetName(), GetNid(), GetPid(), GetVerifier()
                         , ProcessTypeString(GetType()), NodePhaseString( MyNode->GetPhase() )
                         , supplyProcessDeathNotices );
-
-        if ( Type == ProcessType_DTM &&
-             MyNode->GetPhase() == Phase_Ready &&
-             supplyProcessDeathNotices )
-        {
-            // Send local DTMs this DTM's death message
-            CLNode *lnode = MyNode->GetFirstLNode();
-            for ( ; lnode; lnode = lnode->GetNextP() )
-            {
-                CProcess *tmProcess = lnode->GetProcessLByType( ProcessType_DTM );
-                if ( tmProcess && MyNode->GetState() == State_Up )
-                {
-                    SQ_theLocalIOToClient->putOnNoticeQueue( tmProcess->Pid
-                                                           , tmProcess->Verifier
-                                                           , DeathMessage()
-                                                           , NULL);
-
-                    if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
-                       trace_printf( "%s@%d" " - Sending death message of %s (%d,%d:%d) to %s (%d,%d:%d)\n"
-                                   , method_name, __LINE__
-                                   , GetName(), GetNid(), GetPid(), GetVerifier()
-                                   , tmProcess->GetName(), tmProcess->GetNid()
-                                   , tmProcess->GetPid(), tmProcess->GetVerifier());
-
-                }
-                else
-                {
-                    if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_PROCESS_DETAIL | TRACE_REQUEST_DETAIL))
-                        trace_printf("%s@%d: No DTM process found in nid=%d\n",
-                                     method_name, __LINE__, lnode->GetNid());
-                }
-            }
-        }
     }
 
     if ( parent && !parent->IsClone() && Pid != -1 )
@@ -3836,38 +3965,6 @@
 }
 
 #ifndef NAMESERVER_PROCESS
-bool CProcess::MyTransactions( struct message_def *msg )
-{
-    int idx;
-    CNotice *notice = NoticeHead;
-
-    const char method_name[] = "CProcess::MyTransactions";
-    TRACE_ENTRY;
-
-    while (notice)
-    {
-        if ( !isNull( notice->TransID ) )
-        {
-            idx = msg->u.reply.u.trans_info.num_processes;
-            msg->u.reply.u.trans_info.procs[idx].nid = notice->Nid;
-            msg->u.reply.u.trans_info.procs[idx].pid = notice->Pid;
-            msg->u.reply.u.trans_info.procs[idx].trans_id = notice->TransID;
-            msg->u.reply.u.trans_info.num_processes++;
-            if (msg->u.reply.u.trans_info.num_processes >= MAX_PROC_LIST)
-            {
-                msg->u.reply.u.trans_info.return_code = MPI_ERR_TRUNCATE;
-                return FAILURE;
-            }
-        }
-        notice = notice->GetNext();
-    }
-
-    TRACE_EXIT;
-    return SUCCESS;
-}
-#endif
-
-#ifndef NAMESERVER_PROCESS
 bool CProcess::Open (CProcess * opened_process, int death_notification)
 {
     const char method_name[] = "CProcess::Open";
@@ -4153,7 +4250,8 @@
         mon_log_write(MON_PROCESSCONT_PROCESSCONT_1, SQ_LOG_ERR, buf);
 
         sem_unlink(sem_name);
-        abort();
+
+        mon_failure_exit();
     }
 
 #ifndef NAMESERVER_PROCESS
@@ -4203,7 +4301,8 @@
                      method_name, sem_name, strerror(err));
             mon_log_write(MON_PROCESSCONT_PROCESSCONT_4, SQ_LOG_ERR, buf);
         }
-        abort();
+
+        mon_failure_exit();
     }
 
 #ifndef NAMESERVER_PROCESS
@@ -4631,7 +4730,7 @@
             if ( ! MyNode->IsSpareNode() )
             {
                 int nid = MyNode->AssignNid();
-                if ( (nid == -1) && (MyNode->GetState() != State_Up) )
+                if ( nid == -1 )
                 {
                     snprintf( la_buf, sizeof(la_buf),
                             "[%s], Can't attach the pid %d (program: %s) - the monitor is not up yet (curr state: %d).\n",
@@ -4832,6 +4931,12 @@
     const char method_name[] = "CProcessContainer::BuildOurName";
     TRACE_ENTRY;
 
+    // We are skipping 'A', 'I', 'O', and 'U' to distinguish between zero
+    // and one digits, and for political correctness in generated names
+    static char b32table[32] =  {'0','1','2','3','4','5','6','7','8','9'
+                                ,'B','C','D','E','F','G','H','J','K','L','M'
+                                ,'N','P','Q','R','S','T','V','W','X','Y','Z' };
+
     int i;
     int rem;
     int cnt[6];
@@ -4877,13 +4982,25 @@
     }
     else
     {
-        // We are skipping 'A', 'I', 'O', and 'U' to distinguish between zero
-        // and one digits, and for political correctness in generated names
-        char b32table[32] =  {'0','1','2','3','4','5','6','7','8','9'
-                             ,'B','C','D','E','F','G','H','J','K','L','M'
-                             ,'N','P','Q','R','S','T','V','W','X','Y','Z' };
+        // Process name format long: '$Zxxxxpppppp' xxxx = <nid>, pppppp = <pid>
+        sprintf(name,"$Z");
     
-        // Convert Pid into base 32 ascii
+        // Convert <nid> into base 32 (1,048,575)
+        cnt[0] = nid / 32768;       // (32 * 32 * 32)
+        rem = nid - ( cnt[0] * 32768 );
+        cnt[1] = rem / 1024;        // (32 * 32)
+        rem -= ( cnt[1] * 1024 );
+        cnt[2] = rem / 32;
+        rem -= ( cnt[2] * 32 );
+        cnt[3] = rem;
+    
+        // Convert <nid> into base 32 ascii
+        for(i=3; i>=0; i--)
+        {
+            name[i+2] = static_cast<char>(b32table[cnt[i]]);
+        }
+    
+        // Convert <pid> into base 32 (1,073,741,823)
         cnt[0] = pid / 33554432;    // (32 * 32 * 32 * 32 * 32)
         rem = pid - ( cnt[0] * 33554432 );
         cnt[1] = rem / 1048576;     // (32 * 32 * 32 * 32)
@@ -4896,12 +5013,7 @@
         rem -= ( cnt[4] * 32 );
         cnt[5] = rem;
     
-        // Process name format long: '$Zxxxxpppppp' xxxx = nid, pppppp = pid
-    
-        // Convert Nid into base 16 ascii
-        sprintf(name,"$Z%4.4X",nid);
-    
-        // Convert Pid into base 32 ascii
+        // Convert <pid> into base 32 ascii
         for(i=5; i>=0; i--)
         {
             name[i+6] = static_cast<char>(b32table[cnt[i]]);
@@ -5466,7 +5578,8 @@
     if (process)
     {
         AddToList( process );
-        if (type == ProcessType_NameServer ||
+        if (type == ProcessType_DTM ||
+            type == ProcessType_NameServer ||
             type == ProcessType_Watchdog ||
             type == ProcessType_PSD ||
             type == ProcessType_SMS )
@@ -5916,7 +6029,7 @@
         {
             process->SetState (State_Stopped);
             if ( !process->IsClone() &&
-                 (!MyNode->IsKillingNode() || MyNode->IsSoftNodeDown()) &&
+                 !MyNode->IsKillingNode() &&
                  !MyNode->isInQuiesceState() &&
                  !(process->GetType() == ProcessType_DTM &&
                    process->IsAbended() &&
@@ -6661,27 +6774,48 @@
     {
         char buf[MON_STRING_BUF_SIZE];
         snprintf( buf, sizeof(buf)
-                , "[%s], Persistent process %s not "
-                  "restarted because the persist configuration is "
-                  "missing.\n"
+                , "[%s], Persistent process %s (%s) not restarted on nid=%d "
+                  "because the persist configuration is missing.\n"
                 , method_name
-                , process->GetName() );
+                , process->GetName()
+                , ProcessTypeString( process->GetType() )
+                , process->GetNid() );
         mon_log_write(MON_PROCESS_PERSIST_2, SQ_LOG_ERR, buf);
         return false;
     }
 
+    if (!process->IsClone())
+    {
+        if ( process->GetType() == ProcessType_DTM )
+        {
+            MyNode->ResetPrimitiveDtmUp();
+        }
+        else if ( process->GetType() == ProcessType_Watchdog )
+        {
+            MyNode->ResetPrimitiveWdgUp();
+        }
+        else if ( process->GetType() == ProcessType_PSD )
+        {
+             MyNode->ResetPrimitivePsdUp() ;
+        }
+    }
+
     // if 1st time retrying to restart process
     if (process->GetPersistentCreateTime() == 0)
     {
-        process->SetFirstInstance(false);
         process->SetPersistentCreateTime ( time(NULL) );
     }
 
     if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
-       trace_printf( "%s@%d - Persistent process retries = %d, "
-                     "time limit = %d, down nid=%d\n"
+       trace_printf( "%s@%d - Persistent process %s, retryCount=%d, max_retries = %d, "
+                     "time limit = %d, createTime=%ld, down nid=%d\n"
                    , method_name, __LINE__
-                   , max_retries, retry_max_time, downNid);
+                   , process->GetName()
+                   , process->GetPersistentRetries()
+                   , max_retries
+                   , retry_max_time
+                   , process->GetPersistentCreateTime()
+                   , downNid);
 
     // get the parent process if any
     if (process->GetParentNid() != -1 && process->GetParentPid() != -1)
@@ -6874,9 +7008,9 @@
             if ( (time(NULL) - process->GetPersistentCreateTime()) < retry_max_time )
             {
                 int retryCount = process->GetPersistentRetries();
-                if ( retryCount < max_retries )
+                ++retryCount;
+                if ( retryCount <= max_retries )
                 {
-                    ++retryCount;
                     process->SetPersistentRetries ( retryCount );
                 }
                 else
@@ -6890,10 +7024,12 @@
 
                     char buf[MON_STRING_BUF_SIZE];
 
-                    snprintf(buf, sizeof(buf), "[%s], Persistent process %s "
-                             "not restarted because the maximum retry count "
-                             "(%d) has been exceeded.\n",
-                             method_name, process->GetName(), retryCount);
+                    snprintf( buf, sizeof(buf)
+                            , "[%s], Persistent process %s not restarted because "
+                              "the maximum retry count has been exceeded. "
+                              "(retryCount=%d, maxRetryCount=%d) \n"
+                            , method_name, process->GetName()
+                            , retryCount, max_retries );
                     mon_log_write(MON_PROCESS_PERSIST_1, SQ_LOG_INFO, buf);
 
                     if ( process->GetType() == ProcessType_DTM ||
@@ -6925,15 +7061,15 @@
             }
             else
             {
-                process->SetPersistentRetries ( 0 );
                 if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
-                    trace_printf("%s@%d" " - Retries count reset for process " "%s" "\n", method_name, __LINE__, process->GetName());
-            }
-
-            if ( process->GetType() == ProcessType_DTM )
-            {
-                // Kill all local processes
-                Monitor->SoftNodeDown( MyPNID );
+                    trace_printf( "%s@%d - Retries count reset for process %s, "
+                                  "retry: count=%d, time elapsed=%ld, time max=%d\n"
+                                , method_name, __LINE__
+                                , process->GetName()
+                                , process->GetPersistentRetries()
+                                , (time(NULL) - process->GetPersistentCreateTime())
+                                , retry_max_time);
+                process->SetPersistentRetries( 1 );
             }
 
             // OK ... just restart the process on the same node
@@ -6952,7 +7088,10 @@
                     ->AddToNameMap(process);
                 Nodes->GetLNode (process->GetNid())->GetNode()
                     ->AddToPidMap(process->GetPid(), process);
-                process->SetPersistentCreateTime ( time(NULL) );
+                if (process->GetPersistentRetries() == 1)
+                {
+                    process->SetPersistentCreateTime ( time(NULL) );
+                }
                 if ( process->GetType() == ProcessType_SSMP )
                 {
                     Nodes->GetLNode ( process->GetNid() )->SetSSMProc ( process );
@@ -7125,14 +7264,12 @@
         {
             if (errno == ESRCH)
             {   // Process no longer exists
-                if (trace_settings & TRACE_PROCESS)
-                    trace_printf("%s@%d process %d no longer exists\n",
-                                 method_name, __LINE__, pid);
-                // Log info
-                snprintf(buf, sizeof(buf),
-                         "[%s], process %d no longer exists, initiating "
-                         "exit processing\n", method_name, pid);
-                mon_log_write(MON_PROCESS_PIDHANGUPCHECK_1, SQ_LOG_INFO, buf);
+                if (trace_settings & (TRACE_PROCESS_DETAIL | TRACE_EVLOG_MSG))
+                {
+                    trace_printf("%s@%d process %d no longer exists, initiating "
+                                 "exit processing\n"
+                                , method_name, __LINE__, pid);
+                }
 
                 // Remove from set
                 hungupPids_.erase ( pid );
@@ -7215,12 +7352,14 @@
             // Process intends to exits, when the child death arrives the
             // State_Stopped is processed
             if (trace_settings & TRACE_PROCESS)
-                trace_printf( "%s@%d Setting State_Down for process %s(%d,%d:%d), abend=%d, down=%d\n"
+                trace_printf( "%s@%d Setting State_Down for process %s(%d,%d:%d),"
+                              "state=%s, abend=%d, down=%d\n"
                             , method_name, __LINE__
                             , process->GetName()
                             , process->GetNid()
                             , process->GetPid()
                             , process->GetVerifier()
+                            , StateString(process->GetState())
                             , abend, downNode );
             process->SetState( State_Down );
             if ( abend && !process->IsAbended() )
@@ -7228,7 +7367,6 @@
                 process->SetAbended( abend );
             }
             break;
-
         case State_Stopped:
             if ( process->GetState() != State_Stopped )
             {
@@ -7314,43 +7452,3 @@
     TRACE_EXIT;
 }
 
-
-
-#ifndef NAMESERVER_PROCESS
-bool CProcessContainer::WhoEnlisted( _TM_Txid_External trans_id, struct message_def *msg )
-{
-    int idx;
-    CProcess *process = head_;
-    CNotice *notice;
-
-    const char method_name[] = "CProcessContainer::WhoEnlisted";
-    TRACE_ENTRY;
-    while ((process) &&
-           (msg->u.reply.u.trans_info.num_processes < MAX_PROC_LIST ))
-    {
-        notice = process->GetNoticeHead();
-        while (notice)
-        {
-            if ( isEqual( notice->TransID, trans_id ) )
-            {
-                idx = msg->u.reply.u.trans_info.num_processes;
-                msg->u.reply.u.trans_info.procs[idx].nid = process->GetNid();
-                msg->u.reply.u.trans_info.procs[idx].pid = process->GetPid();
-                msg->u.reply.u.trans_info.procs[idx].trans_id = trans_id;
-                msg->u.reply.u.trans_info.num_processes++;
-                if (msg->u.reply.u.trans_info.num_processes >= MAX_PROC_LIST)
-                {
-                    msg->u.reply.u.trans_info.return_code = MPI_ERR_TRUNCATE;
-                    return FAILURE;
-                }
-                break;
-            }
-            notice = notice->GetNext();
-        }
-        process = process->GetNext();
-    }
-
-    TRACE_EXIT;
-    return SUCCESS;
-}
-#endif
diff --git a/core/sqf/monitor/linux/process.h b/core/sqf/monitor/linux/process.h
index fbe7db3..0f827b8 100644
--- a/core/sqf/monitor/linux/process.h
+++ b/core/sqf/monitor/linux/process.h
@@ -179,7 +179,6 @@
     inline  void SetLastProcess( CProcess *process ) { tail_ = process; }
     inline void SetNameMap( nameMap_t *nameMap ) { nameMap_ = nameMap; };
     inline void SetPidMap( pidMap_t *pidMap ) { pidMap_ = pidMap; };
-    bool WhoEnlisted( _TM_Txid_External trans_id, struct message_def *msg );
     bool WhoOpenedMe( CProcess *process, struct message_def *msg );
     bool WhoOpenedMe( CProcess *process, struct message_def *msg, CProcess *entry );
     bool WhoOpenedMe( CProcess *process, int pid, struct message_def *msg );
@@ -264,7 +263,7 @@
     void CompleteRequest( int status );
     bool Create (CProcess *parent, void* tag, int & result);
     bool Dump (CProcess *dumper, char *core_path);
-    void DumpBegin(int nid, int pid, Verifier_t verifier, char *core_path);
+    void DumpBegin(int dumperNid, int dumperPid, Verifier_t dumperVerifier, char *core_path);
     void DumpEnd(DUMPSTATUS status, char *core_file);
     void Exit( CProcess *parent );
     void GenerateEvent( int event_id, int length, char *data );
@@ -360,11 +359,7 @@
     inline void  SetPriorPid ( pid_t pid ) { priorPid_ = pid; }
     CProcess *GetProcessLByType( PROCESSTYPE type );
     inline STATE GetState() { return State_; }
-    inline void IncrUnsolTmSyncCount() { ++unsolTmSyncCount_; }
-    inline int  DecrUnsolTmSyncCount() { --unsolTmSyncCount_; return unsolTmSyncCount_; }
-    inline int  GetUnsolTmSyncCount() { return unsolTmSyncCount_; }
     bool MakePrimary(void);
-    bool MyTransactions( struct message_def *msg );
     bool Open( CProcess *opened_process, int death_notification );
 #ifndef NAMESERVER_PROCESS
     CNotice *RegisterDeathNotification( int nid
@@ -527,7 +522,6 @@
     CProcess    *prev_;     // previous process in logial node container list
     CProcess    *nextL_;    // next process in physical node container list
     CProcess    *prevL_;    // previous process in physical node container list
-    int          unsolTmSyncCount_;
 
     int          Last_error;
 
@@ -543,6 +537,7 @@
     strId_t      ldpathStrId_;
     bool         firstInstance_; // reset on persistent process re-creation
     bool         cmpOrEsp_;
+    string       trafRootZnode_;  // TRAF_ROOT_ZNODE passed to object file
     string       trafConf_;     // TRAF_CONF passed to object file
     string       trafHome_;     // TRAF_HOME passed to object file
     string       trafLog_;      // TRAF_LOG passed to object file
diff --git a/core/sqf/monitor/linux/pstartd.cxx b/core/sqf/monitor/linux/pstartd.cxx
index 74b35f3..eb78d91 100644
--- a/core/sqf/monitor/linux/pstartd.cxx
+++ b/core/sqf/monitor/linux/pstartd.cxx
@@ -93,9 +93,6 @@
         case MsgType_NodeJoining:
             str = "MsgType_NodeJoining";
             break;
-        case MsgType_NodePrepare:
-            str = "MsgType_NodePrepare";
-            break;
         case MsgType_NodeQuiesce:
             str = "MsgType_NodeQuiesce";
             break;
@@ -123,18 +120,6 @@
         case MsgType_SpareUp:
             str = "MsgType_SpareUp";
             break;
-        case MsgType_TmRestarted:
-            str = "MsgType_TmRestarted";
-            break;
-        case MsgType_TmSyncAbort:
-            str = "MsgType_TmSyncAbort";
-            break;
-        case MsgType_TmSyncCommit:
-            str = "MsgType_TmSyncCommit";
-            break;
-        case MsgType_UnsolicitedMessage:
-            str = "MsgType_UnsolicitedMessage";
-            break;
         default:
             str = "MsgType - Undefined";
             break;
@@ -166,8 +151,6 @@
     case MsgType_NodeDown:
     case MsgType_NodeQuiesce:
     case MsgType_NodeJoining:
-    case MsgType_TmSyncAbort:
-    case MsgType_TmSyncCommit:
         if ( tracing )
         {
             trace_printf( "%s@%d CB Notice: Type=%d\n",
@@ -877,11 +860,14 @@
 
 CRequest * CPStartD::getReq ( )
 {
-    CRequest *req;
+    CRequest *req = NULL;   // next queued request; stays NULL when none is queued
     CAutoLock autoLock(getLocker());
 
-    req = workQ_.front();
-    workQ_.pop_front();
+    if (!workQ_.empty())    // front()/pop_front() must not run on an empty queue
+    {
+        req = workQ_.front();
+        workQ_.pop_front();
+    }
 
     return req;
 }
@@ -911,13 +897,13 @@
         if ( ! ClusterConfig.LoadConfig() )
         {
             printf("[%s], Failed to load cluster configuration.\n", MyName);
-            abort();
+            exit(EXIT_FAILURE);
         }
     }
     else
     {
         printf( "[%s] Warning: No cluster.conf found\n",MyName);
-        abort();
+        exit(EXIT_FAILURE);
     }
 
     return true;
@@ -1116,7 +1102,7 @@
             case ProcessType_NameServer:
             case ProcessType_Watchdog:
             default:
-                // Skip these, they are managed by DTM Lead and monitor processes
+                // Skip these, they are managed by the monitor
                 if ( tracing )
                 {
                     trace_printf("%s@%d Persist type %s NOT targeted for restart\n",
@@ -1154,7 +1140,7 @@
     }
 
     // Determine trace file name
-    const char *tmpDir = getenv( "MPI_TMPDIR" );
+    const char *tmpDir = getenv( "TRAF_LOG" );
     snprintf( traceFileName, sizeof(traceFileName),
               "%s/pstartd.trace.%d", ((tmpDir != NULL) ? tmpDir : currentDir),
               getpid() );
@@ -1240,13 +1226,6 @@
     MyNid = monUtil.getNid();
     MyPid = monUtil.getPid();
 
-    // Set flag to indicate whether we are operating in a real cluster
-    // or a virtual cluster.
-    if ( getenv("SQ_VIRTUAL_NODES") )
-    {
-        IsRealCluster = false;
-    }
-
     MonLog = new CMonLog( "log4cxx.monitor.psd.config", "PSD", "alt.pstartd", MyPNID, MyNid, MyPid, MyName );
 
     pStartD = new CPStartD;
diff --git a/core/sqf/monitor/linux/ptpclient.cxx b/core/sqf/monitor/linux/ptpclient.cxx
index 39e4443..f2aa887 100644
--- a/core/sqf/monitor/linux/ptpclient.cxx
+++ b/core/sqf/monitor/linux/ptpclient.cxx
@@ -88,7 +88,8 @@
                 , "[%s@%d] MON2MON_COMM_PORT environment variable is not set!\n"
                 , method_name, __LINE__ );
         mon_log_write( PTPCLIENT_PTPCLIENT_1, SQ_LOG_CRIT, buf );
-        abort();
+
+        mon_failure_exit();
     }
 
     ptpClusterSocks_ = new int[MAX_NODES];
@@ -392,6 +393,219 @@
     return error;
 }
 
+// Send an InternalType_Dump request for 'process' to the monitor of the
+// node the process runs on.
+//
+// The message carries the target's nid/pid/verifier, the dumper's
+// nid/pid/verifier and the dump file name, all read from 'process'.
+//
+// Returns 0 without sending when IsTargetRemote() is false for the target
+// nid, -1 when no node object exists for that nid, otherwise the value
+// returned by SendToMon().
+// NOTE(review): -1 is used as a generic failure code here; confirm it suits
+// the callers' handling of SendToMon() errors.
+int CPtpClient::ProcessDump( CProcess *process )
+{
+    const char method_name[] = "CPtpClient::ProcessDump";
+    TRACE_ENTRY;
+
+    if (!IsTargetRemote( process->GetNid() ))
+    {
+        if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS))
+        {
+            trace_printf( "%s@%d - Not Sending InternalType_Dump request to "
+                          "local nid=%d\n"
+                        , method_name, __LINE__
+                        , process->GetNid() );
+        }
+        TRACE_EXIT;
+        return(0);
+    }
+
+    int targetNid = process->GetNid();
+    CLNode *targetLNode = Nodes->GetLNode(targetNid);
+    CNode *targetNode = targetLNode ? targetLNode->GetNode() : NULL;
+    if (targetNode == NULL)
+    {
+        // SendToMon() needs the target node's name; bail out rather than
+        // dereference a NULL node pointer.
+        if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS))
+        {
+            trace_printf( "%s@%d - No node found for target nid=%d, "
+                          "InternalType_Dump request not sent\n"
+                        , method_name, __LINE__
+                        , targetNid );
+        }
+        TRACE_EXIT;
+        return(-1);
+    }
+
+    if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS))
+    {
+        trace_printf( "%s@%d - Sending InternalType_Dump request to %s, targetPNid=%d"
+                      ", target process=%s (%d,%d:%d)\n"
+                    , method_name, __LINE__
+                    , targetNode->GetName()
+                    , targetNode->GetPNid()
+                    , process->GetName()
+                    , process->GetNid()
+                    , process->GetPid()
+                    , process->GetVerifier() );
+    }
+
+    struct internal_msg_def msg;
+    memset(&msg, 0, sizeof(msg));
+    msg.type = InternalType_Dump;
+    msg.u.dump.nid = process->GetNid();
+    msg.u.dump.pid = process->GetPid();
+    msg.u.dump.verifier = process->GetVerifier();
+    msg.u.dump.dumper_nid = process->GetDumperNid();
+    msg.u.dump.dumper_pid = process->GetDumperPid();
+    msg.u.dump.dumper_verifier = process->GetDumperVerifier();
+    // Bounded copy: msg is zero-filled above, so stopping one byte short of
+    // the buffer size keeps core_file NUL-terminated.
+    strncpy( msg.u.dump.core_file
+           , process->GetDumpFile()
+           , sizeof(msg.u.dump.core_file) - 1 );
+
+    // Message size: everything before the union 'u' plus the dump member.
+    ptpMsgInfo_t myInfo;
+    myInfo.pnid = MyPNID;
+    myInfo.size = offsetof(struct internal_msg_def, u);
+    myInfo.size += sizeof(msg.u.dump);
+
+    if (trace_settings & TRACE_PROCESS_DETAIL)
+    {
+        trace_printf( "%s@%d - size_=%d, process %s (%d,%d:%d), "
+                      "dumper (%d,%d:%d), core_file=%s\n"
+                    , method_name, __LINE__
+                    , myInfo.size
+                    , process->GetName()
+                    , msg.u.dump.nid
+                    , msg.u.dump.pid
+                    , msg.u.dump.verifier
+                    , msg.u.dump.dumper_nid
+                    , msg.u.dump.dumper_pid
+                    , msg.u.dump.dumper_verifier
+                    , msg.u.dump.core_file );
+    }
+
+    int error = SendToMon( "process-dump"
+                         , &msg
+                         , myInfo
+                         , targetNid
+                         , targetNode->GetName() );
+
+    TRACE_EXIT;
+    return error;
+}
+
+// Send an InternalType_DumpComplete reply for 'process' to the monitor of
+// the node the dumper process runs on.
+//
+// The message carries the target's nid/pid/verifier, the dumper's
+// nid/pid/verifier and the dump file name, all read from 'process'.
+//
+// Returns 0 without sending when IsTargetRemote() is false for the dumper
+// nid, -1 when no node object exists for that nid, otherwise the value
+// returned by SendToMon().
+// NOTE(review): -1 is used as a generic failure code here; confirm it suits
+// the callers' handling of SendToMon() errors.
+int CPtpClient::ProcessDumpComplete( CProcess *process )
+{
+    const char method_name[] = "CPtpClient::ProcessDumpComplete";
+    TRACE_ENTRY;
+
+    if (!IsTargetRemote( process->GetDumperNid() ))
+    {
+        if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS))
+        {
+            trace_printf( "%s@%d - Not Sending InternalType_DumpComplete reply to "
+                          "local nid=%d\n"
+                        , method_name, __LINE__
+                        , process->GetDumperNid() );
+        }
+        TRACE_EXIT;
+        return(0);
+    }
+
+    int targetNid = process->GetDumperNid();
+    CLNode *targetLNode = Nodes->GetLNode(targetNid);
+    CNode *targetNode = targetLNode ? targetLNode->GetNode() : NULL;
+    if (targetNode == NULL)
+    {
+        // SendToMon() needs the dumper node's name; bail out rather than
+        // dereference a NULL node pointer.
+        if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS))
+        {
+            trace_printf( "%s@%d - No node found for dumper nid=%d, "
+                          "InternalType_DumpComplete reply not sent\n"
+                        , method_name, __LINE__
+                        , targetNid );
+        }
+        TRACE_EXIT;
+        return(-1);
+    }
+
+    if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS))
+    {
+        trace_printf( "%s@%d - Sending InternalType_DumpComplete reply to %s, targetPNid=%d"
+                      ", dumper process (%d,%d:%d)\n"
+                    , method_name, __LINE__
+                    , targetNode->GetName()
+                    , targetNode->GetPNid()
+                    , process->GetDumperNid()
+                    , process->GetDumperPid()
+                    , process->GetDumperVerifier() );
+    }
+
+    struct internal_msg_def msg;
+    memset(&msg, 0, sizeof(msg));
+    msg.type = InternalType_DumpComplete;
+    msg.u.dump.nid = process->GetNid();
+    msg.u.dump.pid = process->GetPid();
+    msg.u.dump.verifier = process->GetVerifier();
+    msg.u.dump.dumper_nid = process->GetDumperNid();
+    msg.u.dump.dumper_pid = process->GetDumperPid();
+    msg.u.dump.dumper_verifier = process->GetDumperVerifier();
+    // Bounded copy: msg is zero-filled above, so stopping one byte short of
+    // the buffer size keeps core_file NUL-terminated.
+    strncpy( msg.u.dump.core_file
+           , process->GetDumpFile()
+           , sizeof(msg.u.dump.core_file) - 1 );
+
+    // Message size: everything before the union 'u' plus the dump member.
+    ptpMsgInfo_t myInfo;
+    myInfo.pnid = MyPNID;
+    myInfo.size = offsetof(struct internal_msg_def, u);
+    myInfo.size += sizeof(msg.u.dump);
+
+    if (trace_settings & TRACE_PROCESS_DETAIL)
+    {
+        trace_printf( "%s@%d - size_=%d, process %s (%d,%d:%d), "
+                      "dumper (%d,%d:%d), core_file=%s\n"
+                    , method_name, __LINE__
+                    , myInfo.size
+                    , process->GetName()
+                    , msg.u.dump.nid
+                    , msg.u.dump.pid
+                    , msg.u.dump.verifier
+                    , msg.u.dump.dumper_nid
+                    , msg.u.dump.dumper_pid
+                    , msg.u.dump.dumper_verifier
+                    , msg.u.dump.core_file );
+    }
+
+    int error = SendToMon( "process-dump-complete"
+                         , &msg
+                         , myInfo
+                         , targetNid
+                         , targetNode->GetName() );
+
+    TRACE_EXIT;
+    return error;
+}
+
 int CPtpClient::ProcessExit( CProcess *process
                            , int targetNid
                            , const char *targetNodeName )
diff --git a/core/sqf/monitor/linux/ptpclient.h b/core/sqf/monitor/linux/ptpclient.h
index 5239c78..95563f8 100644
--- a/core/sqf/monitor/linux/ptpclient.h
+++ b/core/sqf/monitor/linux/ptpclient.h
@@ -47,6 +47,8 @@
                           , int targetNid
                           , const char* targetNodeName );
     int  ProcessClone( CProcess* process );
+    int  ProcessDump( CProcess* process );
+    int  ProcessDumpComplete( CProcess* process );
     int  ProcessExit( CProcess* process
                     , int parentNid
                     , const char* targetNodeName );
diff --git a/core/sqf/monitor/linux/ptpcommaccept.cxx b/core/sqf/monitor/linux/ptpcommaccept.cxx
index 15933dd..0972ec9 100644
--- a/core/sqf/monitor/linux/ptpcommaccept.cxx
+++ b/core/sqf/monitor/linux/ptpcommaccept.cxx
@@ -144,14 +144,64 @@
         {
             switch ( msg.type )
             {
-                case InternalType_UniqStr:
+                case InternalType_Clone:
                 {
                     if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS))
                     {
-                        trace_printf( "%s@%d" " - Received InternalType_UniqStr\n"
+                        trace_printf( "%s@%d" " - Received InternalType_Clone\n"
                                     , method_name, __LINE__ );
                     }
-                    ReqQueue.enqueueUniqStrReq( &msg.u.uniqstr);
+                    ReqQueue.enqueueCloneReq( &msg.u.clone );
+                    break;
+                }
+                case InternalType_Exit:
+                {
+                    if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS))
+                    {
+                        trace_printf( "%s@%d" " - Received InternalType_Exit\n"
+                                    , method_name, __LINE__ );
+                    }
+                    ReqQueue.enqueueExitReq( &msg.u.exit );
+                    break;
+                }
+                case InternalType_IoData:
+                {
+                    if (trace_settings & (TRACE_REDIRECTION | TRACE_PROCESS))
+                    {
+                        trace_printf( "%s@%d" " - Received InternalType_IoData\n"
+                                    , method_name, __LINE__ );
+                    }
+                    ReqQueue.enqueueIoDataReq( &msg.u.iodata );
+                    break;
+                }
+                case InternalType_Kill:
+                {
+                    if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS))
+                    {
+                        trace_printf( "%s@%d" " - Received InternalType_Kill\n"
+                                    , method_name, __LINE__ );
+                    }
+                    ReqQueue.enqueueKillReq( &msg.u.kill );
+                    break;
+                }
+                case InternalType_Notify:
+                {
+                    if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS))
+                    {
+                        trace_printf( "%s@%d" " - Received InternalType_Notify\n"
+                                    , method_name, __LINE__ );
+                    }
+                    ReqQueue.enqueueNotifyReq( &msg.u.notify );
+                    break;
+                }
+                case InternalType_Open:
+                {
+                    if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS))
+                    {
+                        trace_printf( "%s@%d" " - Received InternalType_Open\n"
+                                    , method_name, __LINE__ );
+                    }
+                    ReqQueue.enqueueOpenReq( &msg.u.open );
                     break;
                 }
                 case InternalType_Process:
@@ -181,66 +231,6 @@
                     }
                     break;
                 }
-                case InternalType_Clone:
-                {
-                    if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS))
-                    {
-                        trace_printf( "%s@%d" " - Received InternalType_Clone\n"
-                                    , method_name, __LINE__ );
-                    }
-                    ReqQueue.enqueueCloneReq( &msg.u.clone );
-                    break;
-                }
-                case InternalType_Open:
-                {
-                    if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS))
-                    {
-                        trace_printf( "%s@%d" " - Received InternalType_Open\n"
-                                    , method_name, __LINE__ );
-                    }
-                    ReqQueue.enqueueOpenReq( &msg.u.open );
-                    break;
-                }
-                case InternalType_Notify:
-                {
-                    if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS))
-                    {
-                        trace_printf( "%s@%d" " - Received InternalType_Notify\n"
-                                    , method_name, __LINE__ );
-                    }
-                    ReqQueue.enqueueNotifyReq( &msg.u.notify );
-                    break;
-                }
-                case InternalType_Exit:
-                {
-                    if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS))
-                    {
-                        trace_printf( "%s@%d" " - Received InternalType_Exit\n"
-                                    , method_name, __LINE__ );
-                    }
-                    ReqQueue.enqueueExitReq( &msg.u.exit );
-                    break;
-                }
-                case InternalType_Kill:
-                {
-                    if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS))
-                    {
-                        trace_printf( "%s@%d" " - Received InternalType_Kill\n"
-                                    , method_name, __LINE__ );
-                    }
-                    ReqQueue.enqueueKillReq( &msg.u.kill );
-                    break;
-                }
-                case InternalType_IoData:
-                {
-                    if (trace_settings & (TRACE_REDIRECTION | TRACE_PROCESS))
-                    {
-                        trace_printf( "%s@%d" " - Received InternalType_IoData\n"
-                                    , method_name, __LINE__ );
-                    }
-                    ReqQueue.enqueueIoDataReq( &msg.u.iodata );
-                    break;
-                }
                 case InternalType_StdinReq:
                 {
                     if (trace_settings & (TRACE_REDIRECTION | TRACE_PROCESS))
@@ -251,6 +241,16 @@
                     ReqQueue.enqueueStdInReq( &msg.u.stdin_req );
                     break;
                 }
+                case InternalType_UniqStr:
+                {
+                    if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS))
+                    {
+                        trace_printf( "%s@%d" " - Received InternalType_UniqStr\n"
+                                    , method_name, __LINE__ );
+                    }
+                    ReqQueue.enqueueUniqStrReq( &msg.u.uniqstr);
+                    break;
+                }
                 default:
                 {
                     char buf[MON_STRING_BUF_SIZE];
diff --git a/core/sqf/monitor/linux/redirector.cxx b/core/sqf/monitor/linux/redirector.cxx
index 92d74ac..4df2f86 100644
--- a/core/sqf/monitor/linux/redirector.cxx
+++ b/core/sqf/monitor/linux/redirector.cxx
@@ -54,6 +54,7 @@
 #include "pnode.h"
 #include "lock.h"
 #include "mlio.h"
+#include "msgdef.h"
 #include "redirector.h"
 #include "replicate.h"
 #include "monsonar.h"
@@ -73,6 +74,7 @@
 extern CReqQueue ReqQueue;
 extern CPtpClient *PtpClient;
 extern bool NameServerEnabled;
+extern const char *StateString( STATE state);
 #endif
 
 const char *EpollEventString( __uint32_t events )
@@ -905,29 +907,78 @@
     const char method_name[] = "CRedirectStdout::CRedirectStdout";
     TRACE_ENTRY;
 
+    int rc = 0;
+    int err = 0;
+
     // Add eyecatcher sequence as a debugging aid
     memcpy(&eyecatcher_, "REDE", 4);
 
-    fd_ = open(filename, O_CREAT | O_APPEND | O_WRONLY | O_NONBLOCK,
-               S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
-    if( fd_ == -1 )
+    if (trace_settings & TRACE_REDIRECTION)
     {
-        char buf[MON_STRING_BUF_SIZE];
-        sprintf(buf, "[%s], open error for %s, %s.\n", method_name,
-                filename, strerror(errno));
-        mon_log_write(MON_REDIR_STDOUT_1, SQ_LOG_ERR, buf);
+        trace_printf( "%s@%d stdout, file=%s\n"
+                    , method_name, __LINE__
+                    , filename );
     }
 
-    else
-    {
-        // Retain file name.  Might be needed in case of error on file.
-        filename_ = filename;
-
-        Redirector.addToMap(fd_, this);
-        if (trace_settings & TRACE_REDIRECTION)
+    if (strlen(filename))
+    {   // stdout file/device is on this node
+        struct stat statbuf;
+        memset(&statbuf, 0, sizeof(statbuf));
+        rc = stat(filename, &statbuf);
+        if (rc == -1)
         {
-            trace_printf("%s@%d opened %s fd=%d.  Added to fdMap.\n",
-                         method_name, __LINE__, filename, fd_);
+            err = errno;
+        }
+
+        if (rc == 0 || err == ENOENT)
+        {
+            fd_ = open(filename, O_CREAT | O_APPEND | O_WRONLY | O_NONBLOCK,
+                       S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
+            if( fd_ == -1 )
+            {
+                err = errno;
+                if ( err == EACCES &&
+                    (S_ISCHR(statbuf.st_mode) || S_ISFIFO(statbuf.st_mode)) )
+                { // Don't log error since it is a common occurrence
+                    if (trace_settings & TRACE_REDIRECTION)
+                    {
+                        trace_printf( "%s@%d stdout is character device or "
+                                      "named pipe, file=%s, errno=%d (%s)\n"
+                                    , method_name, __LINE__
+                                    , filename, err, strerror(err) );
+                    }
+                }
+                else
+                {
+                    char buf[MON_STRING_BUF_SIZE];
+                    sprintf( buf
+                           , "[%s], open error for: "
+                             "file=%s, errno=%d (%s)\n"
+                           , method_name, filename, err, strerror(err) );
+                    mon_log_write(MON_REDIR_STDOUT_1, SQ_LOG_ERR, buf);
+                }
+            }
+            else
+            {
+                // Retain file name.  Might be needed in case of error on file.
+                filename_ = filename;
+    
+                Redirector.addToMap(fd_, this);
+                if (trace_settings & TRACE_REDIRECTION)
+                {
+                    trace_printf("%s@%d opened %s fd=%d.  Added to fdMap.\n",
+                                 method_name, __LINE__, filename, fd_);
+                }
+            }
+        }
+        else
+        {
+            char buf[MON_STRING_BUF_SIZE];
+            sprintf( buf
+                   , "[%s], unable to obtain file info for stdout file"
+                     ", file=%s, errno=%d (%s)\n"
+                   , method_name, filename, err, strerror(err) );
+            mon_log_write(MON_REDIR_STDOUT_2, SQ_LOG_ERR, buf);
         }
     }
 
@@ -1234,7 +1285,7 @@
 
     CProcess *process = NULL;
 
-    if ( !MyNode->IsKillingNode() || MyNode->IsSoftNodeDown() )
+    if ( !MyNode->IsKillingNode() )
     {
         process = MyNode->GetProcess ( pid_ );
     }
@@ -1290,9 +1341,16 @@
             process->SetAbended( true );
         }
         if (trace_settings & (TRACE_PROCESS | TRACE_REDIRECTION))
-            trace_printf("%s@%d Detected broken stderr pipe for child "
-                         "process, pid=%d; waiting for child death signal\n",
-                         method_name, __LINE__, pid_);
+            trace_printf( "%s@%d Detected broken stderr pipe for child "
+                          "process %s(%d,%d:%d), state=%s, IsAbended=%s; "
+                          "waiting for child death signal\n"
+                        , method_name, __LINE__
+                        , process->GetName()
+                        , process->GetNid()
+                        , process->GetPid()
+                        , process->GetVerifier()
+                        , StateString(process->GetState())
+                        , process->IsAbended()?"TRUE":"FALSE" );
         process->SetHangupTime ();
         MyNode->PidHangupSet ( pid_ );
     }
@@ -1542,17 +1600,20 @@
 
     if (filename[0])
     {   // stdin source file/device is on this node
-
         struct stat statbuf;
         if (stat(filename, &statbuf) == -1)
         {
-            char buf[MON_STRING_BUF_SIZE];
-            sprintf(buf, "[%s], unable to obtain file info for stdin file"
-                    ", file=%s. Closing stdin pipe fd=%d\n",
-                    method_name, filename, pipeFd );
-            mon_log_write(MON_REDIR_STDIN_FD_1, SQ_LOG_DEBUG, buf);
-
-            close ( pipeFd );
+            int err = errno;
+            if (err != ENOENT)
+            {
+                char buf[MON_STRING_BUF_SIZE];
+                sprintf(buf, "[%s], unable to obtain file info for stdin file"
+                        ", file=%s, errno=%d (%s). Closing stdin pipe fd=%d\n",
+                        method_name, filename, err, strerror(err), pipeFd );
+                mon_log_write(MON_REDIR_STDIN_FD_1, SQ_LOG_ERR, buf);
+            }
+            delFromMap( pipeFd );
+            close( pipeFd );
             pipeFd = -1;
         }
         else
@@ -2021,13 +2082,7 @@
                     // bugcatcher, temp call
                     redirect->validateObj();
                 }
-                else
-                {
-                    char buf[MON_STRING_BUF_SIZE];
-                    sprintf(buf, "[%s], fd=%d not found in map\n",
-                            method_name, fd);
-                    mon_log_write(MON_REDIRECT_TH_2, SQ_LOG_WARNING, buf);
-                }
+                // else fd was already deleted
 
                 if (events & (EPOLLIN | EPOLLPRI))
                 {
@@ -2106,17 +2161,7 @@
                                 delete redirect;
                         }
                     }
-                    else
-                    {   // Unexpected state
-                        char buf[MON_STRING_BUF_SIZE];
-                        sprintf(buf, "[%s], unexpected redirect object state "
-                                "at hangup, fd=%d, events=%d, redirect=%p, "
-                                "pid_=%d, fd_=%d\n",
-                                method_name, fd, events, redirect,
-                                (redirect == NULL ? 0 : redirect->pid()),
-                                (redirect == NULL ? 0 : redirect->fd()));
-                        mon_log_write(MON_REDIRECT_TH_4, SQ_LOG_ERR, buf);
-                    }
+                    // else process object was already deleted by child death processing
                 }
                 else if (events & EPOLLERR)
                 {
diff --git a/core/sqf/monitor/linux/replicate.cxx b/core/sqf/monitor/linux/replicate.cxx
index 15e0394..5b80df4 100644
--- a/core/sqf/monitor/linux/replicate.cxx
+++ b/core/sqf/monitor/linux/replicate.cxx
@@ -68,9 +68,7 @@
     }
 }
 
-#ifdef NAMESERVER_PROCESS
 struct dummy_sizeof_def {};
-#endif
 #ifndef EXCHANGE_CPU_SCHEDULING_DATA
 struct dummy1_sizeof_def {};
 #endif
@@ -78,53 +76,57 @@
 // Determine the maximum size of a replication object (excluding CReplEvent)
 int CReplObj::calcAllocSize()
 {
-    return  max(max(max(max(max(max(max(max(max(max(max(max(max(max(max(max(max(max(max(max(max(max(sizeof(CReplNameServerAdd),
-                                                                                                sizeof(CReplNodeName)),
-                                                                                            sizeof(CReplNodeAdd)),
-                                                                                        sizeof(CReplNodeDelete)),
-                                                                                    sizeof(CReplSoftNodeUp)),
-                                                                                sizeof(CReplSoftNodeDown)),
-#ifdef EXCHANGE_CPU_SCHEDULING_DATA
-                                                                            sizeof(CReplSchedData)),
-#else
-                                                                            sizeof(dummy1_sizeof_def)),
-#endif
-                                                                        sizeof(CReplActivateSpare)),
-                                                                    sizeof(CReplConfigData)),
-                                                                sizeof(CReplOpen)),
-                                                            sizeof(CReplProcInit)),
-                                                        sizeof(CReplProcess)),
-                                                    sizeof(CReplClone)),
-                                                sizeof(CReplExit)),
-                                            sizeof(CReplKill)),
 #ifdef NAMESERVER_PROCESS
-                                        sizeof(CReplExitNs)),
+    return          max(max(max(max(max(max(max(max(max(max(max(max(max(max(max(max(max(max(max(max(max(max(sizeof(CReplNameServerAdd),
 #else
-                                        sizeof(CReplDevice)),
+    return          max(max(max(max(max(max(max(max(max(max(max(max(max(max(max(max(max(max(max(max(max(max(sizeof(CReplNameServerAdd),
 #endif
-                                    sizeof(CReplNodeDown)),
-                                sizeof(CReplNodeUp)),
+                                                                                                        sizeof(CReplNodeName)),
+                                                                                                    sizeof(CReplNodeAdd)),
+                                                                                                sizeof(CReplNodeDelete)),
+                                                                                            sizeof(dummy_sizeof_def)),
+                                                                                        sizeof(dummy_sizeof_def)),
+#ifdef EXCHANGE_CPU_SCHEDULING_DATA
+                                                                                    sizeof(CReplSchedData)),
+#else
+                                                                                    sizeof(dummy1_sizeof_def)),
+#endif
+                                                                                sizeof(CReplActivateSpare)),
+                                                                            sizeof(CReplConfigData)),
+                                                                        sizeof(CReplOpen)),
+                                                                    sizeof(CReplProcInit)),
+                                                                sizeof(CReplProcess)),
+                                                            sizeof(CReplClone)),
+                                                        sizeof(CReplExit)),
+                                                    sizeof(CReplKill)),
+#ifdef NAMESERVER_PROCESS
+                                                sizeof(CReplExitNs)),
+#else
+                                                sizeof(CReplDevice)),
+#endif
+                                            sizeof(CReplNodeDown)),
+                                        sizeof(CReplNodeUp)),
+#ifdef NAMESERVER_PROCESS
+                                    sizeof(dummy_sizeof_def)),
+#else
+                                    sizeof(CReplDump)),
+#endif
+#ifdef NAMESERVER_PROCESS
+                                sizeof(dummy_sizeof_def)),
+#else
+                                sizeof(CReplDumpComplete)),
+#endif
 #ifdef NAMESERVER_PROCESS
                             sizeof(dummy_sizeof_def)),
 #else
-                            sizeof(CReplDump)),
+                            sizeof(CReplStdioData)),
 #endif
 #ifdef NAMESERVER_PROCESS
                         sizeof(dummy_sizeof_def)),
 #else
-                        sizeof(CReplDumpComplete)),
+                        sizeof(CReplStdinReq)),
 #endif
-#ifdef NAMESERVER_PROCESS
-                    sizeof(dummy_sizeof_def)),
-#else
-                    sizeof(CReplStdioData)),
-#endif
-#ifdef NAMESERVER_PROCESS
-                sizeof(dummy_sizeof_def)),
-#else
-                sizeof(CReplStdinReq)),
-#endif
-            sizeof(CReplShutdown));
+                    sizeof(CReplShutdown));
 }
 
 void * CReplObj::operator new(size_t ) throw()
@@ -1268,11 +1270,15 @@
     if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_PROCESS_DETAIL))
     {
         const char method_name[] = "CReplDump::CReplDump";
-        trace_printf("%s@%d" " - Queuing dump pending (%d, %d) "
-                     "dumper (%d, %d)\n",
-                     method_name, __LINE__,
-                     process_->GetNid(), process_->GetPid(),
-                     process_->GetDumperNid(), process_->GetDumperPid());
+        trace_printf( "%s@%d" " - Queuing dump pending (%d,%d:%d) "
+                      "dumper (%d,%d:%d)\n"
+                    , method_name, __LINE__
+                    , process_->GetNid()
+                    , process_->GetPid()
+                    , process_->GetVerifier()
+                    , process_->GetDumperNid()
+                    , process_->GetDumperPid()
+                    , process_->GetDumperVerifier() );
     }
 
     // Increment reference count for process object
@@ -1304,7 +1310,7 @@
 
     if (trace_settings & (TRACE_SYNC | TRACE_PROCESS))
     {
-        trace_printf("%s@%d" " - Replicating dump pending (%d, %d) "
+        trace_printf("%s@%d" " - Replicating dump pending target (%d, %d) "
                      "dumper (%d, %d)\n",
                      method_name, __LINE__,
                      process_->GetNid(), process_->GetPid(),
@@ -1343,11 +1349,15 @@
     if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_PROCESS_DETAIL))
     {
         const char method_name[] = "CReplDumpComplete::CReplDumpComplete";
-        trace_printf("%s@%d  - Queuing dump complete (%d, %d:%d) dumper "
-                     "(%d, %d)\n", method_name, __LINE__,
-                     process_->GetNid(), process_->GetPid(),
-                     process_->GetDumperNid(), process_->GetDumperPid(),
-                     process_->GetDumperVerifier());
+        trace_printf( "%s@%d  - Queuing dump complete target (%d,%d:%d) "
+                      "dumper (%d,%d:%d)\n"
+                    , method_name, __LINE__
+                    , process_->GetNid()
+                    , process_->GetPid()
+                    , process_->GetVerifier()
+                    , process_->GetDumperNid()
+                    , process_->GetDumperPid()
+                    , process_->GetDumperVerifier());
     }
 
     // Increment reference count for process object
@@ -1379,13 +1389,15 @@
 
     if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
     {
-        trace_printf("%s@%d" " - Replicating dump complete (%d, %d:%d) "
-                     "dumper (%d, %d:%d)\n",
-                     method_name, __LINE__,
-                     process_->GetNid(), process_->GetPid(),
-                     process_->GetVerifier(),
-                     process_->GetDumperNid(), process_->GetDumperPid(),
-                     process_->GetDumperVerifier());
+        trace_printf( "%s@%d" " - Replicating dump complete target (%d,%d:%d) "
+                      "dumper (%d,%d:%d)\n"
+                    , method_name, __LINE__
+                    , process_->GetNid()
+                    , process_->GetPid()
+                    , process_->GetVerifier()
+                    , process_->GetDumperNid()
+                    , process_->GetDumperPid()
+                    , process_->GetDumperVerifier() );
     }
 
     msg->type = InternalType_DumpComplete;
@@ -2034,104 +2046,6 @@
     return true;
 }
 
-CReplSoftNodeDown::CReplSoftNodeDown(int pnid) : pnid_(pnid)
-{
-    // Add eyecatcher sequence as a debugging aid
-    memcpy(&eyecatcher_, "RPLX", 4);
-
-    // Compute message size (adjust if needed to conform to
-    // internal_msg_def structure alignment).
-    replSize_ = (MSG_HDR_SIZE + sizeof ( down_def ) + msgAlignment_)
-                & ~msgAlignment_;
-
-    if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_PROCESS_DETAIL))
-    {
-        const char method_name[] = "CReplSoftNodeDown::CReplSoftNodeDown";
-        trace_printf("%s@%d  - Queuing soft node down, pnid=%d\n",
-                     method_name, __LINE__, pnid_);
-    }
-}
-
-CReplSoftNodeDown::~CReplSoftNodeDown()
-{
-    const char method_name[] = "CReplSoftNodeDown::~CReplSoftNodeDown";
-
-    if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_PROCESS_DETAIL))
-        trace_printf("%s@%d - Soft node down replication for pnid=%d\n", method_name, __LINE__, pnid_ );
-
-    // Alter eyecatcher sequence as a debugging aid to identify deleted object
-    memcpy(&eyecatcher_, "rplx", 4);
-}
-
-bool CReplSoftNodeDown::replicate(struct internal_msg_def *&msg)
-{
-    const char method_name[] = "CReplSoftNodeDown::replicate";
-    TRACE_ENTRY;
-
-    if (trace_settings & (TRACE_SYNC | TRACE_PROCESS))
-        trace_printf("%s@%d" " - Replicating soft node down, pnid=%d\n", method_name, __LINE__, pnid_);
-
-    // build message to replicate this soft node down to other nodes
-    msg->type = InternalType_SoftNodeDown;
-    msg->u.down.pnid = pnid_;
-
-    // Advance sync buffer pointer
-    Nodes->AddMsg( msg, replSize() );
-
-    TRACE_EXIT;
-
-    return true;
-}
-
-CReplSoftNodeUp::CReplSoftNodeUp(int pnid) : pnid_(pnid)
-{
-    // Add eyecatcher sequence as a debugging aid
-    memcpy(&eyecatcher_, "RPLY", 4);
-
-    // Compute message size (adjust if needed to conform to
-    // internal_msg_def structure alignment).
-    replSize_ = (MSG_HDR_SIZE + sizeof ( down_def ) + msgAlignment_)
-                & ~msgAlignment_;
-
-    if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_PROCESS_DETAIL))
-    {
-        const char method_name[] = "CReplSoftNodeUp::CReplSoftNodeUp";
-        trace_printf("%s@%d  - Queuing soft node up, pnid=%d\n",
-                     method_name, __LINE__, pnid_);
-    }
-}
-
-CReplSoftNodeUp::~CReplSoftNodeUp()
-{
-    const char method_name[] = "CReplSoftNodeUp::~CReplSoftNodeUp";
-
-    if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_PROCESS_DETAIL))
-        trace_printf("%s@%d - Soft node up replication for pnid=%d\n", method_name, __LINE__, pnid_ );
-
-    // Alter eyecatcher sequence as a debugging aid to identify deleted object
-    memcpy(&eyecatcher_, "rply", 4);
-}
-
-bool CReplSoftNodeUp::replicate(struct internal_msg_def *&msg)
-{
-    const char method_name[] = "CReplSoftNodeUp::replicate";
-    TRACE_ENTRY;
-
-    if (trace_settings & (TRACE_SYNC | TRACE_PROCESS))
-        trace_printf("%s@%d" " - Replicating soft node up, pnid=%d\n", method_name, __LINE__, pnid_);
-
-    // build message to replicate this soft node up to other nodes
-    msg->type = InternalType_SoftNodeUp;
-    msg->u.down.pnid = pnid_;
-
-    // Advance sync buffer pointer
-    Nodes->AddMsg( msg, replSize() );
-
-    TRACE_EXIT;
-
-    return true;
-}
-
 #ifndef NAMESERVER_PROCESS
 CReplStdioData::CReplStdioData(int nid, int pid, StdIoType type, ssize_t count,
                                char *data)
diff --git a/core/sqf/monitor/linux/replicate.h b/core/sqf/monitor/linux/replicate.h
index 0526159..55c078f 100644
--- a/core/sqf/monitor/linux/replicate.h
+++ b/core/sqf/monitor/linux/replicate.h
@@ -395,7 +395,6 @@
     CProcess *process_;
 };
 
-
 class CReplNodeUp: public CReplObj
 {
 public:
@@ -408,32 +407,6 @@
     int pnid_;
 };
 
-
-class CReplSoftNodeDown: public CReplObj
-{
-public:
-    CReplSoftNodeDown(int pnid);
-    virtual ~CReplSoftNodeDown();
-
-    bool replicate(struct internal_msg_def *& msg);
-
-private:
-    int pnid_;
-};
-
-class CReplSoftNodeUp: public CReplObj
-{
-public:
-    CReplSoftNodeUp(int pnid);
-    virtual ~CReplSoftNodeUp();
-
-    bool replicate(struct internal_msg_def *& msg);
-
-private:
-    int pnid_;
-};
-
-
 #ifdef EXCHANGE_CPU_SCHEDULING_DATA
 class CReplSchedData: public CReplObj
 {
diff --git a/core/sqf/monitor/linux/reqget.cxx b/core/sqf/monitor/linux/reqget.cxx
index d53fdf5..89886ea 100644
--- a/core/sqf/monitor/linux/reqget.cxx
+++ b/core/sqf/monitor/linux/reqget.cxx
@@ -213,12 +213,6 @@
         }
         else
         {
-            char buf[MON_STRING_BUF_SIZE];
-            sprintf( buf, "%s@%d - Can't find group(%s).\n"
-                   , method_name, __LINE__
-                   , msg_->u.request.u.get.group);
-            mon_log_write(MON_MONITOR_GETCONF_3, SQ_LOG_INFO, buf);
-             
             msg_->u.reply.type = ReplyType_Get;
             msg_->u.reply.u.get.type = ConfigType_Undefined;
             msg_->u.reply.u.get.group[0] = '\0';
diff --git a/core/sqf/monitor/linux/reqinstanceid.cxx b/core/sqf/monitor/linux/reqinstanceid.cxx
new file mode 100644
index 0000000..8c531df
--- /dev/null
+++ b/core/sqf/monitor/linux/reqinstanceid.cxx
@@ -0,0 +1,132 @@
+///////////////////////////////////////////////////////////////////////////////
+//
+// @@@ START COPYRIGHT @@@
+//
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+//
+// @@@ END COPYRIGHT @@@
+//
+///////////////////////////////////////////////////////////////////////////////
+
+#include <stdio.h>
+#include <string.h>
+#include "reqqueue.h"
+#include "montrace.h"
+#include "monsonar.h"
+#include "monlogging.h"
+
+extern CNode* MyNode;
+extern int    ClusterId ;
+extern int    InstanceId;
+
+// Construct an external "instance id" request; the arguments are passed
+// through unchanged to the CExternalReq base constructor.
+CExtInstanceIdReq::CExtInstanceIdReq( reqQueueMsg_t msgType
+                                    , int pid
+                                    , struct message_def *msg )
+    : CExternalReq(msgType, pid, msg)
+{
+    // Add eyecatcher sequence as a debugging aid
+    memcpy(&eyecatcher_, "RQEV", 4);
+}
+
+// Destructor: only re-stamps the eyecatcher so a deleted object is recognizable.
+CExtInstanceIdReq::~CExtInstanceIdReq()
+{
+    // Alter eyecatcher sequence as a debugging aid to identify deleted object
+    memcpy(&eyecatcher_, "rqev", 4);
+}
+
+// Build the text description of this request (request type name, request
+// id and the requester's nid/pid/verifier as carried in the message) and
+// store it in requestString_.
+void CExtInstanceIdReq::populateRequestString( void )
+{
+    char strBuf[MON_STRING_BUF_SIZE/2] = { 0 };
+
+    snprintf( strBuf, sizeof(strBuf),
+              "ExtReq(%s) req #=%ld "
+              "requester(nid=%d/pid=%d/verifier=%d) "
+            , CReqQueue::svcReqType[reqType_], getId()
+            , msg_->u.request.u.instance_id.nid
+            , msg_->u.request.u.instance_id.pid
+            , msg_->u.request.u.instance_id.verifier );
+    requestString_.assign( strBuf );
+}
+
+// Service the request: when MyNode->GetProcess() finds the requester,
+// reply with ReplyType_InstanceId carrying the global ClusterId and
+// InstanceId; otherwise build a ReplyType_Generic reply with return code
+// MPI_ERR_NAME.  The reply is handed to lioreply() in both cases.
+void CExtInstanceIdReq::performRequest()
+{
+    const char method_name[] = "CExtInstanceIdReq::performRequest";
+    TRACE_ENTRY;
+
+    // Trace info about request
+    if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS))
+    {
+        trace_printf( "%s@%d request #%ld: InstanceId, requester (%d,%d:%d)\n"
+                    , method_name, __LINE__, id_
+                    , msg_->u.request.u.instance_id.nid
+                    , msg_->u.request.u.instance_id.pid
+                    , msg_->u.request.u.instance_id.verifier );
+    }
+
+    CProcess *requester;
+
+    nid_ = msg_->u.request.u.instance_id.nid;
+    verifier_ = msg_->u.request.u.instance_id.verifier;
+
+    requester = MyNode->GetProcess( pid_
+                                  , verifier_ );
+    if ( requester )
+    {
+        // Process the request
+        msg_->u.reply.type = ReplyType_InstanceId;
+        msg_->u.reply.u.instance_id.cluster_id  = ClusterId;
+        msg_->u.reply.u.instance_id.instance_id = InstanceId;
+        if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS))
+        {
+            trace_printf( "%s@%d - cluster_id=%d, instance_id=%d\n"
+                        , method_name, __LINE__
+                        , msg_->u.reply.u.instance_id.cluster_id
+                        , msg_->u.reply.u.instance_id.instance_id );
+        }
+    }
+    else
+    {
+        // the requester already exited and the LIO buffer will be cleaned up
+        msg_->u.reply.type = ReplyType_Generic;
+        msg_->u.reply.u.generic.nid = -1;
+        msg_->u.reply.u.generic.pid = -1;
+        msg_->u.reply.u.generic.verifier = -1;
+        msg_->u.reply.u.generic.process_name[0] = '\0';
+        msg_->u.reply.u.generic.return_code = MPI_ERR_NAME;
+        if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS))
+        {
+            trace_printf( "%s@%d - requester process not found!\n"
+                        , method_name, __LINE__);
+        }
+    }
+
+    // Send reply to requester
+    lioreply(msg_, pid_);
+
+    TRACE_EXIT;
+}
diff --git a/core/sqf/monitor/linux/reqnodeadd.cxx b/core/sqf/monitor/linux/reqnodeadd.cxx
index e82dcf4..6d92e00 100644
--- a/core/sqf/monitor/linux/reqnodeadd.cxx
+++ b/core/sqf/monitor/linux/reqnodeadd.cxx
@@ -82,7 +82,7 @@
     CProcess       *requester = NULL;
 
     // Trace info about request
-    if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS))
+    if (trace_settings & (TRACE_REQUEST | TRACE_REQUEST_DETAIL))
     {
         trace_printf("%s@%d request #%ld: NodeAdd, "
                      "node_name=%s, first_core=%d, last_core=%d, "
diff --git a/core/sqf/monitor/linux/reqprocinfo.cxx b/core/sqf/monitor/linux/reqprocinfo.cxx
index c49a877..06bce0d 100644
--- a/core/sqf/monitor/linux/reqprocinfo.cxx
+++ b/core/sqf/monitor/linux/reqprocinfo.cxx
@@ -41,6 +41,8 @@
 extern CNameServer *NameServer;
 #endif
 
+extern const char *ProcessTypeString( PROCESSTYPE type );
+
 // Copy information for a specific process into the reply message buffer.
 void CExtProcInfoBase::ProcessInfo_CopyData(CProcess *process, ProcessInfoState &procState)
 {
@@ -179,6 +181,10 @@
 // and the caller will need the new value.
 CProcess * CExtProcInfoBase::ProcessInfo_GetProcess (int &nid, bool getDataForAllNodes)
 {
+#ifndef NAMESERVER_PROCESS 
+    const char method_name[] = "CExtProcInfoBase::ProcessInfo_GetProcess";
+#endif
+
     CProcess * process;
     CLNode *lnode = NULL;
 
@@ -200,6 +206,17 @@
                 process = lnode->GetFirstProcess();
                 if (process != 0)
                 {
+                    if (trace_settings & TRACE_PROCESS_DETAIL)
+                    {
+                        trace_printf( "%s@%d allNodes=%d, nid=%d, process: %s (%d,%d:%d)\n"
+                                    , method_name, __LINE__
+                                    , getDataForAllNodes
+                                    , nid
+                                    , process->GetName()
+                                    , process->GetNid()
+                                    , process->GetPid()
+                                    , process->GetVerifier() );
+                    }
                     return process;
                 }
             }
@@ -221,6 +238,8 @@
                                      bool getDataForAllNodes,
                                      char *pattern)
 {
+    const char method_name[] = "CExtProcInfoBase::ProcessInfo_BuildReply";
+
     int currentIndex = (process != 0) 
             ? Nodes->GetNidIndex( process->GetNid() )
             : Nodes->GetLNodesCount();
@@ -251,6 +270,18 @@
         // Retrieve process data for processes on current node
         while ( process )
         {
+            if (trace_settings & TRACE_PROCESS_DETAIL)
+            {
+                trace_printf( "%s@%d allNodes=%d, pattern=%s, type=%s, process: %s (%d,%d:%d)\n"
+                            , method_name, __LINE__
+                            , getDataForAllNodes
+                            , (strlen( pattern ))?process_pattern: ""
+                            , ProcessTypeString(type)
+                            , process->GetName()
+                            , process->GetNid()
+                            , process->GetPid()
+                            , process->GetVerifier() );
+            }
             if (type == ProcessType_Undefined || type == process->GetType())
             {
                 if (reg)
@@ -289,9 +320,33 @@
             // to be the node index number where the process resides.
 
             int nid = Nodes->GetNidByMap( currentIndex );
+            if (trace_settings & TRACE_PROCESS_DETAIL)
+            {
+                trace_printf( "%s@%d moreToRetrieve=%d, nid=%d\n"
+                            , method_name, __LINE__
+                            , moreToRetrieve
+                            , nid );
+            }
             if (nid == -1) break;
+            if (trace_settings & TRACE_PROCESS_DETAIL)
+            {
+                trace_printf( "%s@%d allNodes=%d, nid=%d\n"
+                            , method_name, __LINE__
+                            , getDataForAllNodes
+                            , nid );
+            }
             process = ProcessInfo_GetProcess( nid, getDataForAllNodes);
             currentIndex = Nodes->GetNidIndex( nid );
+            if (process && trace_settings & TRACE_PROCESS_DETAIL)
+            {
+                trace_printf( "%s@%d currentIndex=%d, next process: %s (%d,%d:%d)\n"
+                            , method_name, __LINE__
+                            , currentIndex
+                            , process->GetName()
+                            , process->GetNid()
+                            , process->GetPid()
+                            , process->GetVerifier() );
+            }
             moreToRetrieve = true;
         }
     } while (moreToRetrieve);
diff --git a/core/sqf/monitor/linux/reqqueue.cxx b/core/sqf/monitor/linux/reqqueue.cxx
index 3d425f2..0860c4d 100644
--- a/core/sqf/monitor/linux/reqqueue.cxx
+++ b/core/sqf/monitor/linux/reqqueue.cxx
@@ -44,6 +44,7 @@
 #include "redirector.h"
 #include "nameserver.h"
 #include "ptpclient.h"
+#include "zclient.h"
 #endif
 
 extern int MyPNID;
@@ -67,6 +68,7 @@
 extern CProcess *NameServerProcess;
 extern CNameServer *NameServer;
 extern CNameServerConfigContainer *NameServerConfig;
+extern CZClient* ZClient;
 #endif
 
 extern int req_type_startup;
@@ -1042,6 +1044,362 @@
 #endif
 
 #ifndef NAMESERVER_PROCESS
+CIntDumpCompleteReq::CIntDumpCompleteReq()  // Default-construct an empty "dump complete" internal request
+           : CInternalReq()
+           , nid_(0)                // target process node id; overwritten by prepRequest()
+           , pid_(0)                // target process id; overwritten by prepRequest()
+           , verifier_(-1)          // -1 makes performRequest() skip the verifier comparison
+           , dumperNid_(0)          // dumper process node id
+           , dumperPid_(0)          // dumper process id
+           , dumperVerifier_(-1)    // dumper process verifier
+{
+    // Add eyecatcher sequence as a debugging aid
+    memcpy(&eyecatcher_, "RqIC", 4);
+
+    coreFile_[0] = '\0';  // start with an empty core file name; status_ is only assigned in prepRequest()
+}
+
+CIntDumpCompleteReq::~CIntDumpCompleteReq()  // Destructor: only re-stamps the eyecatcher, releases nothing
+{
+    // Alter eyecatcher sequence as a debugging aid to identify deleted object
+    memcpy(&eyecatcher_, "rQic", 4);
+}
+
+void CIntDumpCompleteReq::populateRequestString( void )  // Build the text description stored in requestString_
+{
+    char strBuf[MON_STRING_BUF_SIZE/2];  // scratch buffer for the formatted description
+    sprintf( strBuf, "IntReq(%s) req #=%ld (nid=%d/pid=%d/verifier=%d)"
+                   , CReqQueue::intReqType[InternalType_DumpComplete]  // request type name from the lookup table
+                   , getId(), nid_, pid_, verifier_ );
+    requestString_.assign( strBuf );  // copy the formatted text into the request's string member
+}
+
+void CIntDumpCompleteReq::prepRequest( struct dump_def *dumpDef )  // Copy the fields of dumpDef into this request
+{
+    const char method_name[] = "CIntDumpCompleteReq::prepRequest";
+    TRACE_ENTRY;
+
+    nid_ = dumpDef->nid;                        // target process node id
+    pid_ = dumpDef->pid;                        // target process id
+    verifier_ = dumpDef->verifier;              // target process verifier (-1 skips the check in performRequest)
+    dumperNid_ = dumpDef->dumper_nid;           // dumper process node id
+    dumperPid_ = dumpDef->dumper_pid;           // dumper process id
+    dumperVerifier_ = dumpDef->dumper_verifier; // dumper process verifier
+    strcpy( coreFile_, dumpDef->core_file );    // NOTE(review): unbounded copy; assumes coreFile_ is at least as large as core_file - confirm
+    status_ = dumpDef->status;                  // dump status later handed to CProcess::DumpEnd()
+
+    TRACE_EXIT;
+}
+
+void CIntDumpCompleteReq::performRequest()  // Find the target process and call DumpEnd() on it, or log why it could not
+{
+    const char method_name[] = "CIntDumpCompleteReq::performRequest";
+    TRACE_ENTRY;
+
+    CProcess* process = NULL;
+    CLNode*   lnode;
+
+    lnode = Nodes->GetLNode( nid_ );  // logical node of the target; nothing is done or logged if it is not found
+    if ( lnode )
+    {
+        process = lnode->GetProcessL( pid_ );  // look the target process up by pid on that node
+
+        if (process)
+        {
+            int verifier = verifier_;
+            if ( (verifier == -1) || (verifier == process->GetVerifier()) )  // -1 matches any verifier
+            {
+                process->DumpEnd( status_, coreFile_ );  // hand the dump status and core file name to the process
+            }
+            else
+            {
+                // A process with this pid exists but its verifier differs from the requested one
+                char buf[MON_STRING_BUF_SIZE];
+                snprintf( buf, sizeof(buf)
+                        , "[%s], Can't find process nid=%d, "
+                          "pid=%d, verifier=%d for dump complete target.\n"
+                        , method_name
+                        , nid_
+                        , pid_
+                        , verifier_ );
+                mon_log_write(MON_INTREQ_DUMPCOMPLETE_1, SQ_LOG_ERR, buf);
+            }
+        }
+        else
+        {
+            // Dump completion handled in CProcess::Exit()
+            char buf[MON_STRING_BUF_SIZE];
+            snprintf( buf, sizeof(buf)
+                    , "[%s], Can't find process nid=%d, "
+                      "pid=%d for dump complete target.\n"
+                    , method_name
+                    , nid_
+                    , pid_ );
+            mon_log_write(MON_INTREQ_DUMPCOMPLETE_2, SQ_LOG_ERR, buf);
+        }
+    }
+
+    TRACE_EXIT;
+}
+#endif
+
+#ifndef NAMESERVER_PROCESS
+CIntDumpReq::CIntDumpReq()  // Default-construct an empty "dump" internal request
+           : CInternalReq()
+           , nid_(0)                // target process node id; overwritten by prepRequest()
+           , pid_(0)                // target process id; overwritten by prepRequest()
+           , verifier_(-1)          // -1 makes performRequest() skip the verifier comparison
+           , dumperNid_(0)          // dumper process node id
+           , dumperPid_(0)          // dumper process id
+           , dumperVerifier_(-1)    // dumper process verifier
+{
+    // Add eyecatcher sequence as a debugging aid
+    memcpy(&eyecatcher_, "RqID", 4);
+
+    coreFile_[0] = '\0';  // start with an empty core file name
+}
+
+CIntDumpReq::~CIntDumpReq()  // Destructor: only re-stamps the eyecatcher, releases nothing
+{
+    // Alter eyecatcher sequence as a debugging aid to identify deleted object
+    memcpy(&eyecatcher_, "rQid", 4);
+}
+
+void CIntDumpReq::populateRequestString( void )  // Build the text description stored in requestString_
+{
+    char strBuf[MON_STRING_BUF_SIZE/2];  // scratch buffer for the formatted description
+    sprintf( strBuf, "IntReq(%s) req #=%ld (nid=%d/pid=%d/verifier=%d)"
+                   , CReqQueue::intReqType[InternalType_Dump]  // request type name from the lookup table
+                   , getId(), nid_, pid_, verifier_ );
+    requestString_.assign( strBuf );  // copy the formatted text into the request's string member
+}
+
+void CIntDumpReq::prepRequest( struct dump_def *dumpDef )  // Copy the fields of dumpDef into this request
+{
+    const char method_name[] = "CIntDumpReq::prepRequest";
+    TRACE_ENTRY;
+
+    nid_ = dumpDef->nid;                        // target process node id
+    pid_ = dumpDef->pid;                        // target process id
+    verifier_ = dumpDef->verifier;              // target process verifier (-1 skips the check in performRequest)
+    dumperNid_ = dumpDef->dumper_nid;           // dumper process node id
+    dumperPid_ = dumpDef->dumper_pid;           // dumper process id
+    dumperVerifier_ = dumpDef->dumper_verifier; // dumper process verifier
+    strcpy( coreFile_, dumpDef->core_file );    // NOTE(review): unbounded copy; assumes coreFile_ is at least as large as core_file - confirm
+
+    TRACE_EXIT;
+}
+
+void CIntDumpReq::performRequest()  // Find the target process and call DumpBegin() on it, or log why it could not
+{
+    const char method_name[] = "CIntDumpReq::performRequest";
+    TRACE_ENTRY;
+
+    CProcess* process = NULL;
+    CLNode*   lnode;
+
+    lnode = Nodes->GetLNode( nid_ );  // logical node of the target; nothing is done or logged if it is not found
+    if ( lnode )
+    {
+        process = lnode->GetProcessL( pid_ );  // look the target process up by pid on that node
+
+        if (process)
+        {
+            int verifier = verifier_;
+            if ( (verifier == -1) || (verifier == process->GetVerifier()) )  // -1 matches any verifier
+            {
+                process->DumpBegin( dumperNid_        // identity of the dumper process ...
+                                  , dumperPid_
+                                  , dumperVerifier_
+                                  , coreFile_ );      // ... and the core file name from the request
+            }
+            else
+            {
+                // A process with this pid exists but its verifier differs from the requested one
+                char buf[MON_STRING_BUF_SIZE];
+                snprintf( buf, sizeof(buf)
+                        , "[%s], Can't find process nid=%d, "
+                          "pid=%d, verifier=%d for dump target.\n"
+                        , method_name
+                        , nid_
+                        , pid_
+                        , verifier_ );
+                mon_log_write(MON_INTREQ_DUMP_1, SQ_LOG_ERR, buf);
+            }
+        }
+        else
+        {
+            // No process with this pid on the target node
+            char buf[MON_STRING_BUF_SIZE];
+            snprintf( buf, sizeof(buf)
+                    , "[%s], Can't find process nid=%d, "
+                      "pid=%d for dump target.\n"
+                    , method_name
+                    , nid_
+                    , pid_ );
+            mon_log_write(MON_INTREQ_DUMP_2, SQ_LOG_ERR, buf);
+        }
+    }
+
+    TRACE_EXIT;
+}
+#endif
+
+#ifndef NAMESERVER_PROCESS
+CIntEventReq::CIntEventReq()  // Default-construct an empty "event" internal request
+            : CInternalReq()
+            , eventId_(0)           // event identifier; overwritten by prepRequest()
+            , length_(0)            // payload length; also selects data_ vs. bigData_ storage
+            , targetNid_(-1)        // target process node id
+            , targetPid_(-1)        // target process id
+            , targetVerifier_(-1)   // -1 makes performRequest() skip the verifier comparison
+            , bigData_(NULL)        // heap payload, allocated only when length_ > SMALL_DATA_SIZE
+{
+    // Add eyecatcher sequence as a debugging aid
+    memcpy(&eyecatcher_, "RqIE", 4);
+
+    data_[0] = '\0';  // in-object payload buffer starts empty
+}
+
+CIntEventReq::~CIntEventReq()  // Destructor: frees the heap payload, if one was allocated
+{
+    if (length_ > SMALL_DATA_SIZE)  // same condition prepRequest() uses to allocate bigData_
+    {
+        ::delete[] bigData_;
+    }
+
+    // Alter eyecatcher sequence as a debugging aid to identify deleted object
+    memcpy(&eyecatcher_, "rQiE", 4);
+}
+
+void * CIntEventReq::operator new(size_t size)  // Class-specific allocator: traces the size, then uses global new[]
+{
+    if (trace_settings & TRACE_SYNC)
+    {
+        const char method_name[] = "CIntEventReq::operator new";
+        trace_printf("%s@%d  - Allocating %d bytes\n",
+                     method_name, __LINE__, (int) size);
+    }
+
+    void * p = ::new char[size];  // raw char array; released with ::delete[] in operator delete
+
+    return p;
+}
+
+void CIntEventReq::operator delete(void *deadObject, size_t size)  // Class-specific deallocator matching operator new
+{
+    if (trace_settings & TRACE_SYNC)
+    {
+        const char method_name[] = "CIntEventReq::operator delete";
+        trace_printf("%s@%d  - Deleting %d bytes\n",
+                     method_name, __LINE__, (int) size);
+    }
+
+    ::delete [] ((char *) deadObject);  // storage was obtained as a char array, so free it as one
+}
+
+void CIntEventReq::populateRequestString( void )  // Build the text description stored in requestString_
+{
+    char strBuf[MON_STRING_BUF_SIZE/2];  // scratch buffer for the formatted description
+    sprintf( strBuf
+           , "IntReq(%s) req #=%ld eventId=%d, target(nid=%d/pid=%d/verifier=%d), "
+             "data length=%d"
+           , CReqQueue::intReqType[InternalType_Event]  // request type name from the lookup table
+           , getId(), eventId_, targetNid_, targetPid_, targetVerifier_, length_ );
+    requestString_.assign( strBuf );  // copy the formatted text into the request's string member
+}
+
+void CIntEventReq::prepRequest( struct event_def *eventDef )  // Copy the event description and its payload into this request
+{
+    const char method_name[] = "CIntEventReq::prepRequest";
+    TRACE_ENTRY;
+
+    eventId_ = eventDef->event_id;       // event identifier
+    length_ = eventDef->length;          // payload length in bytes; NOTE(review): not range-checked here
+    targetNid_ = eventDef->nid;          // target process node id
+    targetPid_ = eventDef->pid;          // target process id
+    targetVerifier_ = eventDef->verifier; // target process verifier (-1 skips the check in performRequest)
+
+    if (length_ <= SMALL_DATA_SIZE)
+    {
+        memmove(data_, &eventDef->data, length_);  // small payload: keep it in the in-object buffer
+    }
+    else
+    {
+        bigData_ = ::new char[length_];  // large payload: heap copy, freed by the destructor
+        memmove(bigData_, &eventDef->data, length_);
+    }
+
+    TRACE_EXIT;
+}
+
+void CIntEventReq::performRequest()  // Deliver the event to the target process if it lives on this node, else log why not
+{
+    const char method_name[] = "CIntEventReq::performRequest";
+    TRACE_ENTRY;
+
+    CProcess * process = NULL;
+    CLNode *   lnode;
+    char *     eventData = NULL;
+
+    if ( MyNode->IsMyNode( targetNid_ ) )  // fix: test the target node id, not the event id
+    {
+        if (trace_settings & TRACE_SYNC)
+        {
+            trace_printf( "%s@%d - processing event %d for (%d, %d:%d), data length=%d\n"
+                        , method_name, __LINE__
+                        , eventId_
+                        , targetNid_
+                        , targetPid_
+                        , targetVerifier_
+                        , length_ );
+        }
+
+        lnode = Nodes->GetLNode( targetNid_ );  // logical node of the target; nothing is done if it is not found
+        if (lnode)
+        {
+            process = lnode->GetProcessL( targetPid_ );  // look the target process up by pid on that node
+
+            if (process)
+            {
+                int verifier = targetVerifier_;
+                if ( (verifier == -1) || (verifier == process->GetVerifier()) )  // -1 matches any verifier
+                {
+                    eventData = (length_ <= SMALL_DATA_SIZE)?data_:bigData_;  // same storage choice as prepRequest()
+                    process->GenerateEvent( eventId_
+                                          , length_
+                                          , eventData);
+                }
+                else
+                {
+                    // A process with this pid exists but its verifier differs from the requested one
+                    char buf[MON_STRING_BUF_SIZE];
+                    snprintf( buf, sizeof(buf)
+                            , "[%s], Can't find process nid=%d, "
+                              "pid=%d, verifier=%d for event=%d\n"
+                            , method_name
+                            , targetNid_
+                            , targetPid_
+                            , targetVerifier_
+                            , eventId_ );  // fix: "event=%d" must report the event id, not the data length
+                    mon_log_write(MON_INTREQ_EVENT_1, SQ_LOG_ERR, buf);
+                }
+            }
+            else
+            {
+                // No process with this pid on the target node
+                char buf[MON_STRING_BUF_SIZE];
+                snprintf( buf, sizeof(buf)
+                        , "[%s], Can't find process nid=%d, "
+                          "pid=%d for processing event.\n"
+                        , method_name
+                        , targetNid_
+                        , targetPid_ );
+                mon_log_write(MON_INTREQ_EVENT_2, SQ_LOG_ERR, buf);
+            }
+        }
+    }
+
+    TRACE_EXIT;
+}
+#endif
+
+#ifndef NAMESERVER_PROCESS
 CIntExitReq::CIntExitReq( )
             : CInternalReq()
             , nid_(0)
@@ -1089,6 +1447,7 @@
     const char method_name[] = "CIntExitReq::performRequest";
     TRACE_ENTRY;
 
+    bool displayErrorMsg = false;
     CProcess *process = NULL;
     CLNode  *lnode;
 
@@ -1131,10 +1490,17 @@
     }
     else
     {
-        char buf[MON_STRING_BUF_SIZE];
-        sprintf(buf, "[%s], Can't find process %s (%d, %d) for processing "
-                "exit.\n", method_name, name_, nid_, pid_);
-        mon_log_write(MON_CLUSTER_HANDLEOTHERNODE_5, SQ_LOG_ERR, buf);
+        if (lnode)
+        {
+            displayErrorMsg = true;
+        }
+        if (displayErrorMsg)
+        {
+            char buf[MON_STRING_BUF_SIZE];
+            sprintf(buf, "[%s], Can't find process %s (%d, %d) for processing "
+                    "exit.\n", method_name, name_, nid_, pid_);
+            mon_log_write(MON_INTREQ_EXIT_1, SQ_LOG_ERR, buf); 
+        }
     }
 
     TRACE_EXIT;
@@ -2213,7 +2579,7 @@
         char buf[MON_STRING_BUF_SIZE];
         sprintf(buf, "[%s], Can't find process nid=%d, pid=%d for "
                 "processing open.\n", method_name, openedNid_, openedPid_ );
-        mon_log_write(MON_CLUSTER_HANDLEOTHERNODE_11, SQ_LOG_ERR, buf);
+        mon_log_write(MON_INTREQ_OPEN_1, SQ_LOG_ERR, buf); 
     }
 
     TRACE_EXIT;
@@ -2877,7 +3243,6 @@
     TRACE_EXIT;
 }
 
-
 CIntNodeNameReq::CIntNodeNameReq( int req_nid
                                 , int req_pid
                                 , Verifier_t req_verifier
@@ -2915,13 +3280,14 @@
     TRACE_ENTRY;
 
     int rc = MPI_SUCCESS;
-    char current_n[MPI_MAX_PROCESSOR_NAME];
-    char new_n[MPI_MAX_PROCESSOR_NAME];
+    char current_n[TC_PROCESSOR_NAME_MAX];
+    char new_n[TC_PROCESSOR_NAME_MAX];
+    char new_domain[TC_PROCESSOR_NAME_MAX];
     CPNodeConfig   *pnodeConfig = NULL;
     CProcess *requester = NULL;
 
-    strcpy (current_n, current_name_.c_str());
-    strcpy (new_n, new_name_.c_str());
+    strcpy( current_n, current_name_.c_str() );
+    strcpy( new_n, new_name_.c_str() );
 
     CClusterConfig *clusterConfig = Nodes->GetClusterConfig();
     if (clusterConfig)
@@ -2930,11 +3296,55 @@
         pnodeConfig = clusterConfig->GetPNodeConfig( current_n );
         if (pnodeConfig)
         {
+            char short_node_name[TC_PROCESSOR_NAME_MAX];
+            char str1[TC_PROCESSOR_NAME_MAX];
+            char *tmpptr = NULL;
+
+            tmpptr = new_n;
+            while ( *tmpptr )
+            {
+                *tmpptr = (char)tolower( *tmpptr );
+                tmpptr++;
+            }
+        
+            if (IsRealCluster)
+            {
+                // Extract the domain portion of the name if any
+                memset( str1, 0, TC_PROCESSOR_NAME_MAX );
+                memset( short_node_name, 0, TC_PROCESSOR_NAME_MAX );
+                strcpy( str1, new_n );
+    
+                char *str1_dot = strchr( (char *) str1, '.' );
+                if ( str1_dot )
+                {
+                    memcpy( short_node_name, str1, str1_dot - str1 );
+                    // copy the domain portion and skip the '.'
+                    strcpy( new_domain, str1_dot+1 );
+                }
+                else
+                {
+                    strcpy(short_node_name, str1 );
+                    new_domain[0] = 0;
+                }
+    
+                strcpy(new_n, short_node_name);
+    
+            }
+    
+            if (trace_settings & TRACE_PROCESS)
+            {
+                trace_printf( "%s@%d name=%s, domain=%s\n"
+                              , method_name, __LINE__
+                              , new_n
+                              , new_domain );
+            }
+    
             // Update the node name in the configuration database
             if (clusterConfig->UpdatePNodeConfig( pnodeConfig->GetPNid()
-                    , new_n
-                    , pnodeConfig->GetExcludedFirstCore()
-                    , pnodeConfig->GetExcludedLastCore() ))
+                                                , new_n
+                                                , new_domain
+                                                , pnodeConfig->GetExcludedFirstCore()
+                                                , pnodeConfig->GetExcludedLastCore() ))
             {
                 // lock sync thread since we are making a change the monitor's
                 // operational view of the cluster
@@ -3007,6 +3417,7 @@
 }
 
 
+#ifndef NAMESERVER_PROCESS
 CIntNodeAddReq::CIntNodeAddReq( int req_nid
                               , int req_pid
                               , Verifier_t req_verifier
@@ -3060,16 +3471,55 @@
     int nid;
     int pnid;
     int rc = MPI_SUCCESS;
+    char node_name[TC_PROCESSOR_NAME_MAX];
+    char domain_name[TC_PROCESSOR_NAME_MAX];
     CProcess *requester = NULL;
 
-    if (trace_settings & (TRACE_SYNC | TRACE_PROCESS))
+    char short_node_name[TC_PROCESSOR_NAME_MAX];
+    char str1[TC_PROCESSOR_NAME_MAX];
+    char *tmpptr = NULL;
+
+    tmpptr = nodeName_;
+    while ( *tmpptr )
+    {
+        *tmpptr = (char)tolower( *tmpptr );
+        tmpptr++;
+    }
+
+    if (IsRealCluster)
+    {
+        // Extract the domain portion of the name if any
+        memset( str1, 0, TC_PROCESSOR_NAME_MAX );
+        memset( short_node_name, 0, TC_PROCESSOR_NAME_MAX );
+        strcpy( str1, nodeName_ );
+
+        char *str1_dot = strchr( (char *) str1, '.' );
+        if ( str1_dot )
+        {
+            memcpy( short_node_name, str1, str1_dot - str1 );
+            // copy the domain portion and skip the '.'
+            strcpy( domain_name, str1_dot+1 );
+        }
+        else
+        {
+            strcpy(short_node_name, str1 );
+            domain_name[0] = 0;
+        }
+
+        strcpy(node_name, short_node_name);
+
+    }
+
+    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY| TRACE_REQUEST | TRACE_REQUEST_DETAIL))
     {
         trace_printf("%s@%d - Node add request (%s), "
-                     "node_name=%s, first_core=%d, last_core=%d, "
+                     "node_name=%s, domain_name=%s, "
+                     "first_core=%d, last_core=%d, "
                      "processors=%d, roles=%d\n"
                     , method_name, __LINE__
                     , requester ? requester->GetName() : ""
-                    , nodeName_
+                    , node_name
+                    , domain_name
                     , first_core_
                     , last_core_
                     , processors_
@@ -3084,7 +3534,8 @@
 
     // Insert node in configuration database and
     // add to configuration object in monitor
-    if (clusterConfig->SaveNodeConfig( nodeName_
+    if (clusterConfig->SaveNodeConfig( node_name
+                                     , domain_name
                                      , nid
                                      , pnid
                                      , first_core_
@@ -3094,6 +3545,8 @@
                                      , -1 // excludedLastCore
                                      , roles_ ))
     {
+        ZClient->ConfiguredZNodeCreate( node_name);
+
         // lock sync thread since we are making a change the monitor's
         // operational view of the cluster
         if ( !Emulate_Down )
@@ -3181,8 +3634,11 @@
                     , pnid_ );
 
     CClusterConfig *clusterConfig = Nodes->GetClusterConfig();
+    CPNodeConfig   *pnodeConfig =  clusterConfig->GetPNodeConfig( pnid_ ) ;
     if (clusterConfig->DeleteNodeConfig( pnid_ ))
     {
+        ZClient->ConfiguredZNodeDelete( pnodeConfig->GetName() );
+
         // lock sync thread
         if ( !Emulate_Down )
         {
@@ -3217,6 +3673,7 @@
 
     TRACE_EXIT;
 }
+#endif
 
 CIntDownReq::CIntDownReq( int pnid )
     : CInternalReq(),
@@ -3276,79 +3733,6 @@
     TRACE_EXIT;
 }
 
-CIntSoftNodeDownReq::CIntSoftNodeDownReq( int pnid )
-                    : CInternalReq()
-                    , pnid_ ( pnid )
-{
-    // Add eyecatcher sequence as a debugging aid
-    memcpy(&eyecatcher_, "RQIX", 4);
-}
-
-CIntSoftNodeDownReq::~CIntSoftNodeDownReq()
-{
-    // Alter eyecatcher sequence as a debugging aid to identify deleted object
-    memcpy(&eyecatcher_, "rqix", 4);
-}
-
-void CIntSoftNodeDownReq::populateRequestString( void )
-{
-    char strBuf[MON_STRING_BUF_SIZE/2];
-    sprintf( strBuf, "IntReq(%s) req #=%ld (pnid=%d)"
-                   , CReqQueue::intReqType[InternalType_SoftNodeDown]
-                   , getId(), pnid_ );
-    requestString_.assign( strBuf );
-}
-
-void CIntSoftNodeDownReq::performRequest()
-{
-    const char method_name[] = "CIntSoftNodeDownReq::performRequest";
-    TRACE_ENTRY;
-
-    if (trace_settings & (TRACE_SYNC | TRACE_REQUEST))
-        trace_printf("%s@%d - Node soft down request, pnid=%d\n",
-                     method_name, __LINE__, pnid_);
-    Monitor->SoftNodeDown( pnid_ );
-
-    TRACE_EXIT;
-}
-
-CIntSoftNodeUpReq::CIntSoftNodeUpReq( int pnid )
-                  : CInternalReq()
-                  , pnid_ ( pnid )
-{
-    // Add eyecatcher sequence as a debugging aid
-    memcpy(&eyecatcher_, "RQIY", 4);
-
-}
-
-CIntSoftNodeUpReq::~CIntSoftNodeUpReq()
-{
-    // Alter eyecatcher sequence as a debugging aid to identify deleted object
-    memcpy(&eyecatcher_, "rqiy", 4);
-}
-
-void CIntSoftNodeUpReq::populateRequestString( void )
-{
-    char strBuf[MON_STRING_BUF_SIZE/2];
-    sprintf( strBuf, "IntReq(%s) req #=%ld (pnid=%d)"
-                   , CReqQueue::intReqType[InternalType_SoftNodeUp]
-                   , getId(), pnid_ );
-    requestString_.assign( strBuf );
-}
-
-void CIntSoftNodeUpReq::performRequest()
-{
-    const char method_name[] = "CIntSoftNodeUpReq::performRequest";
-    TRACE_ENTRY;
-
-    if (trace_settings & (TRACE_SYNC | TRACE_REQUEST))
-        trace_printf("%s@%d - Soft node up request, pnid=%d\n",
-                     method_name, __LINE__, pnid_ );
-    Monitor->SoftNodeUpPrepare( pnid_ );
-
-    TRACE_EXIT;
-}
-
 CIntUpReq::CIntUpReq( int pnid, char *node_name, int merge_lead )
     : CInternalReq(),
       nodeName_ ( node_name?node_name:"" ),
@@ -3419,6 +3803,14 @@
     const char method_name[] = "CIntActivateSpareReq::performRequest";
     TRACE_ENTRY;
 
+    if (trace_settings & TRACE_INIT)
+    {
+        trace_printf( "%s@%d - Activating spare pnid=%d, spareNode_=%p, "
+                      "downNode_=%p\n"
+                    , method_name, __LINE__
+                    , spareNode_->GetPNid() , spareNode_, downNode_ );
+    }
+
     if ( downNode_ == NULL )
     {
         Monitor->NodeReady(spareNode_);
@@ -3465,6 +3857,8 @@
 
     int error;
 
+    Monitor->EnterSyncCycle();
+
     if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY))
         trace_printf("%s@%d - Revive request\n", method_name, __LINE__);
 
@@ -3626,18 +4020,22 @@
     if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY))
         trace_printf("%s@%d - Node zids unpacked\n", method_name, __LINE__);
 
+    Config->UnpackRegistry(buffer, (header.clusterRegistryCount_ + header.processRegistryCount_));
+
+    if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY))
+        trace_printf("%s@%d - Registry unpacked\n", method_name, __LINE__);
+       
+    Config->UnpackUniqueStrings(buffer, header.uniqueStringCount_);
+    
+    if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY))
+        trace_printf("%s@%d - Unique Strings unpacked\n", method_name, __LINE__);
+       
     // unpack process objects and create clones
     Monitor->UnpackProcObjs(buffer, header.procCount_);
 
     if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY))
         trace_printf("%s@%d - Process Objects unpacked\n", method_name, __LINE__);
 
-    Config->UnpackRegistry(buffer, (header.clusterRegistryCount_ + header.processRegistryCount_));
-    Config->UnpackUniqueStrings(buffer, header.uniqueStringCount_);
-    
-    if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY))
-        trace_printf("%s@%d - Registry unpacked\n", method_name, __LINE__);
-       
     mem_log_write(MON_REQQUEUE_REVIVE_5);
 
     // process the requests that were deferred to the revive side queue.
@@ -3654,6 +4052,8 @@
     // we are in the new monitor, and this will drive the state change
     MyNode->SetChangeState( true );
 
+    Monitor->ExitSyncCycle();
+
     TRACE_EXIT;
 
     return;
@@ -3774,6 +4174,12 @@
 #ifndef NAMESERVER_PROCESS
     // pack the current TM leader
     header.tmLeader_ = Monitor->GetTmLeader();
+    
+    // pack license verifiers
+    for (int index = 0; index < 3; index++)
+    {
+      header.verifiers_[index] = -1;
+    }
 #endif
 
     // pack spareNodes pnids
@@ -3783,14 +4189,14 @@
     header.nodeMapCount_ = Nodes->PackNodeMappings( (intBuffPtr_t&)buf );
 
     Nodes->PackZids( (intBuffPtr_t&)buf );
-
-    // pack process objects
-    header.procCount_ = Monitor->PackProcObjs(buf);
-
+    
     header.clusterRegistryCount_ =  Config->PackRegistry(buf, ConfigType_Cluster);
     header.processRegistryCount_ =  Config->PackRegistry(buf, ConfigType_Process);
     header.uniqueStringCount_   =  Config->PackUniqueStrings(buf);
     
+    // pack process objects
+    header.procCount_ = Monitor->PackProcObjs(buf);
+
     mem_log_write(MON_REQQUEUE_SNAPSHOT_6, header.nodeMapCount_, header.procCount_);
 
     header.fullSize_ = buf - snapshotBuf;
@@ -4138,22 +4544,20 @@
             {
                 startNs = true;
             }
-            if ( !MyNode->IsSoftNodeUp() )
-            {  // Don't restart the name server on a soft node up
-                if ( startNs )
-                {
-                    NameServer->SetLocalHost();
-                    MyNode->StartNameServerProcess();
-                }
-            }
-            else
+            if ( startNs )
             {
-                MyNode->ResetSoftNodeUp();
+                NameServer->SetLocalHost();
+                MyNode->StartNameServerProcess();
             }
         }
         MyNode->StartWatchdogProcess();
         MyNode->StartPStartDProcess();
-        char *env = getenv( "SQ_SEAMONSTER" );
+        char *env = getenv( "MON_DTM_PRIMITIVE_DISABLE" );
+        if ( env == NULL || (env && strcmp( env, "0" ) == 0) )
+        {
+            MyNode->StartDtmProcess();
+        }
+        env = getenv( "SQ_SEAMONSTER" );
         if ( env && strcmp( env, "1" ) == 0 )
         {
             MyNode->StartSMServiceProcess();
@@ -4330,6 +4734,11 @@
             request->setConcurrent(reqConcurrent[msg->u.request.type]);
             break;
 
+        case ReqType_InstanceId:
+            request = new CExtInstanceIdReq(msgType, pid, msg);
+            request->setConcurrent(reqConcurrent[msg->u.request.type]);
+            break;
+
         case ReqType_NewProcess:
             request = new CExtNewProcReq(msgType, nid, pid, -1, msg);
             request->setConcurrent(reqConcurrent[msg->u.request.type]);
@@ -4453,12 +4862,6 @@
             request = new CExtTmReadyReq(msgType, pid, msg);
             request->setConcurrent(reqConcurrent[msg->u.request.type]);
             break;
-
-        case ReqType_TmSync:
-            request = new CExtTmSyncReq(msgType, pid, msg);
-            request->setConcurrent(reqConcurrent[msg->u.request.type]);
-            break;
-
         case ReqType_ZoneInfo:
             request = new CExtZoneInfoReq(msgType, pid, msg);
             request->setConcurrent(reqConcurrent[msg->u.request.type]);
@@ -4466,8 +4869,6 @@
 
         case ReqType_OpenInfo:
         case ReqType_Notice:
-        case ReqType_TransInfo:
-        case ReqType_Stfsd:
 #endif
         default:
             // Invalid request type
@@ -4481,42 +4882,6 @@
             request = NULL;
         }
     }
-#ifndef NAMESERVER_PROCESS
-    else if (msg && msg->type == MsgType_UnsolicitedMessage)
-    {
-        if ( msg->u.reply.type == ReplyType_TmSync )
-        {
-            // This is a reply to an UnsolicitedMessage/TmSync request to the
-            // DTM.  This needs to be handled immediately rather than
-            // being queued and processed later.  That's because the
-            // TmSync operations master could be blocked waiting for
-            // and so queueing the request would be ineffective.
-
-            // Record statistics (sonar counters)
-            if (sonar_verify_state(SONAR_ENABLED | SONAR_MONITOR_ENABLED))
-               MonStats->msg_type_unsolicited_Incr();
-
-            if (trace_settings & (TRACE_REQUEST | TRACE_TMSYNC))
-               trace_printf("%s@%d - TmSync reply\n", method_name, __LINE__);
-            Monitor->ProcessTmSyncReply ( msg );
-
-            // Signal client so local io buffer can be freed
-            int error;
-            SQ_theLocalIOToClient->sendCtlMsg ( pid,
-                                                MC_ReadySend,
-                                                ((SharedMsgDef*)msg)->
-                                                trailer.index,
-                                                &error
-                                                );
-        }
-        else
-        {
-            char           buf[MON_STRING_BUF_SIZE];
-            sprintf(buf, "[%s], Unknown reply type.\n", method_name);
-            mon_log_write(MON_REQQUEUE_PREP_EXT_REQ_1, SQ_LOG_ERR, buf);
-        }
-    }
-#endif
 
     else if (msgType == CExternalReq::ShutdownWork)
     {
@@ -4758,6 +5123,7 @@
     enqueueReq ( request );
 }
 
+#ifndef NAMESERVER_PROCESS
 void CReqQueue::enqueueNodeAddReq( int req_nid
                                  , int req_pid
                                  , Verifier_t req_verifier
@@ -4801,6 +5167,7 @@
 
     enqueueReq ( request );
 }
+#endif
 
 void CReqQueue::enqueueDownReq( int pnid )
 {
@@ -4830,26 +5197,6 @@
     enqueueReq ( request );
 }
 
-void CReqQueue::enqueueSoftNodeDownReq( int pnid )
-{
-    CInternalReq * request;
-
-    request = new CIntSoftNodeDownReq ( pnid );
-
-    request->setPriority(CRequest::High);
-
-    enqueueReq ( request );
-}
-
-void CReqQueue::enqueueSoftNodeUpReq( int pnid )
-{
-    CInternalReq * request;
-
-    request = new CIntSoftNodeUpReq ( pnid );
-
-    enqueueReq ( request );
-}
-
 void CReqQueue::enqueueShutdownReq( int level )
 {
     CInternalReq * request;
@@ -4872,6 +5219,42 @@
 }
 
 #ifndef NAMESERVER_PROCESS
+void CReqQueue::enqueueDumpCompleteReq( struct dump_def *dumpDef )
+{
+    // Allocate the internal dump-complete request, prepare it from the
+    // caller's dumpDef, then hand it to enqueueReq().
+    CIntDumpCompleteReq * const dumpDone = new CIntDumpCompleteReq();
+
+    dumpDone->prepRequest( dumpDef );
+    enqueueReq ( dumpDone );
+}
+#endif
+
+#ifndef NAMESERVER_PROCESS
+void CReqQueue::enqueueDumpReq( struct dump_def *dumpDef )
+{
+    // Allocate the internal dump request, prepare it from the caller's
+    // dumpDef, then hand it to enqueueReq().
+    CIntDumpReq * const dumpReq = new CIntDumpReq();
+
+    dumpReq->prepRequest( dumpDef );
+    enqueueReq ( dumpReq );
+}
+#endif
+
+#ifndef NAMESERVER_PROCESS
+void CReqQueue::enqueueEventReq( struct event_def *eventDef )
+{
+    // Allocate the internal event request, prepare it from the caller's
+    // eventDef, then hand it to enqueueReq().
+    CIntEventReq * const eventReq = new CIntEventReq();
+
+    eventReq->prepRequest( eventDef );
+    enqueueReq ( eventReq );
+}
+#endif
+
+#ifndef NAMESERVER_PROCESS
 void CReqQueue::enqueueExitReq( struct exit_def *exitDef )
 {
     CIntExitReq * request;
@@ -5070,6 +5453,8 @@
 
     request = new CIntCreatePrimitiveReq( pnid );
 
+    request->setPriority(CRequest::High);
+
     enqueueReq ( request );
 }
 #endif
@@ -5114,7 +5499,7 @@
         unsigned long long reqSeqNum = request->getSeqNum();
 
         // move requests whose seq num is above the minSeqNum, discard others.
-        if (reqSeqNum > minSeqNum)
+        if (reqSeqNum >= minSeqNum)
         {
             enqueueReq( request, true );
 
@@ -5575,18 +5960,15 @@
    false,    // ReqType_Shutdown
    false,    // ReqType_ShutdownNs
    false,    // ReqType_Startup
-   false,    // ReqType_Stfsd
    false,    // ReqType_TmLeader
    false,    // ReqType_TmReady
-   false,    // ReqType_TmSync
-   false,    // ReqType_TransInfo
    false,    // ReqType_ZoneInfo
    false     // ReqType_Invalid
 };
 
 // Request names used for trace output
 const char * CReqQueue::svcReqType[] = {
-    "",                 // unused, request types start at 1
+    "INVALID",          // unused, request types start at 1
     "Close",            // ReqType_Close
     "DelProcessNs",     // ReqType_DelProcessNs
     "Dump",             // ReqType_Dump
@@ -5622,20 +6004,18 @@
     "Shutdown",         // ReqType_Shutdown
     "ShutdownNs",       // ReqType_ShutdownNs
     "Startup",          // ReqType_Startup
-    "Stfsd",            // ReqType_Stfsd
     "TmLeader",         // ReqType_TmLeader
     "TmReady",          // ReqType_TmReady
-    "TmSync",           // ReqType_TmSync
-    "TransInfo",        // ReqType_TransInfo
-    "ZoneInfo"          // ReqType_ZoneInfo
-    "Invalid"           // ReqType_Invalid
+    "ZoneInfo",         // ReqType_ZoneInfo
+    "INVALID"           // ReqType_Invalid
 };
 
 // Must match internal.h:InternalType
 const char * CReqQueue::intReqType[] = {
-      ""                  // InternalType_Null
+      "INVALID"           // InternalType_Null
     , "ActivateSpare"     // InternalType_ActivateSpare
     , "Clone"             // InternalType_Clone
+    , "CreatePrimitives"  // InternalType_CreatePrimitives
     , "Device"            // InternalType_Device
     , "Down"              // InternalType_Down
     , "Dump"              // InternalType_Dump
@@ -5652,25 +6032,22 @@
     , "NodeDeleted"       // InternalType_NodeDeleted
     , "NodeName"          // InternalType_NodeName
     , "Notify"            // InternalType_Notify
+    , "Open"              // InternalType_Open
     , "PersistAdd"        // InternalType_PersistAdd
     , "PersistDelete"     // InternalType_PersistDelete
+    , "PostQuiesce"       // InternalType_PostQuiece
     , "Process"           // InternalType_Process
     , "ProcessInit"       // InternalType_ProcessInit
-    , "Open"              // InternalType_Open
-    , "Set"               // InternalType_Set
-    , "StdinReq"          // InternalType_StdinReq
-    , "Sync"              // InternalType_Sync
-    , "Up"                // InternalType_Up
-    , "CreatePrimitives"  // InternalType_CreatePrimitives
     , "Quiesce"           // InternalType_Quiesce
-    , "PostQuiesce"       // InternalType_PostQuiece
     , "Revive"            // InternalType_Revive
-    , "Snapshot"          // InternalType_Snapshot
-    , "UniqStr"           // InternalType_UniqStr
-    , "TMReady"           // InternalType_TmReady
-    , "Shutdown"          // InternalType_Shutdown
     , "SchedData"         // InternalType_SchedData
-    , "SoftNodeDown"      // InternalType_SoftNodeDown
-    , "SoftNodeUp"        // InternalType_SoftNodeUp
+    , "Set"               // InternalType_Set
+    , "Shutdown"          // InternalType_Shutdown
+    , "Snapshot"          // InternalType_Snapshot
+    , "StdinReq"          // InternalType_StdinReq
+    , "TMReady"           // InternalType_TmReady
+    , "Up"                // InternalType_Up
+    , "UniqStr"           // InternalType_UniqStr
+    , "INVALID"           // InternalType_Invalid
 };
 
diff --git a/core/sqf/monitor/linux/reqqueue.h b/core/sqf/monitor/linux/reqqueue.h
index b600a0f..5a3f6c9 100644
--- a/core/sqf/monitor/linux/reqqueue.h
+++ b/core/sqf/monitor/linux/reqqueue.h
@@ -303,6 +303,21 @@
 };
 #endif
 
+#ifndef NAMESERVER_PROCESS
+class CExtInstanceIdReq: public CExternalReq    // External request class; eyecatcher "RQEV" per the assignment table in this header
+{
+public:
+    CExtInstanceIdReq (reqQueueMsg_t msgType, int pid,
+                       struct message_def *msg );
+    virtual ~CExtInstanceIdReq();
+
+    void performRequest();                      // request-specific work; defined outside this header
+
+private:
+    void populateRequestString( void );         // NOTE(review): presumably builds the descriptive text for this request -- confirm in the .cxx
+};
+#endif
+
 class CExtKillReq: public CExternalReq
 {
 public:
@@ -491,6 +506,7 @@
 #ifndef NAMESERVER_PROCESS
 class CExtNameServerDeleteReq: public CExternalReq
 {
+
 public:
     CExtNameServerDeleteReq (reqQueueMsg_t msgType, int pid,
                              struct message_def *msg );
@@ -788,21 +804,6 @@
 #endif
 
 #ifndef NAMESERVER_PROCESS
-class CExtTmSyncReq: public CExternalReq
-{
-public:
-    CExtTmSyncReq (reqQueueMsg_t msgType, int pid,
-                   struct message_def *msg );
-    virtual ~CExtTmSyncReq();
-
-    void performRequest();
-
-private:
-    void populateRequestString( void );
-};
-#endif
-
-#ifndef NAMESERVER_PROCESS
 class CExtZoneInfoReq: public CExternalReq
 {
 public:
@@ -973,6 +974,53 @@
 #endif
 
 #ifndef NAMESERVER_PROCESS
+class CIntDumpCompleteReq: public CInternalReq   // Internal request created by CReqQueue::enqueueDumpCompleteReq(); eyecatcher "RqIC"
+{
+public:
+    CIntDumpCompleteReq();
+    virtual ~CIntDumpCompleteReq();
+
+    void prepRequest( struct dump_def *dumpDef );   // called with the caller's dump_def before the request is queued
+    void performRequest();
+
+private:
+    void populateRequestString( void );
+
+    int nid_;                        // NOTE(review): nid_/pid_/verifier_ presumably identify the dumped process -- confirm in prepRequest()
+    int pid_;
+    Verifier_t verifier_;
+    int dumperNid_;                  // NOTE(review): dumper* presumably identify the process performing the dump -- confirm
+    int dumperPid_;
+    Verifier_t dumperVerifier_;
+    char coreFile_[MAX_FILE_NAME];   // fixed-size buffer of MAX_FILE_NAME bytes
+    DUMPSTATUS status_;              // present only in this dump-complete variant; CIntDumpReq declares no status field
+};
+#endif
+
+#ifndef NAMESERVER_PROCESS
+class CIntDumpReq: public CInternalReq   // Internal request created by CReqQueue::enqueueDumpReq(); eyecatcher "RqID"
+{
+public:
+    CIntDumpReq();
+    virtual ~CIntDumpReq();
+
+    void prepRequest( struct dump_def *dumpDef );   // called with the caller's dump_def before the request is queued
+    void performRequest();
+
+private:
+    void populateRequestString( void );
+
+    int nid_;                        // NOTE(review): nid_/pid_/verifier_ presumably identify the process to dump -- confirm in prepRequest()
+    int pid_;
+    Verifier_t verifier_;
+    int dumperNid_;                  // NOTE(review): dumper* presumably identify the process performing the dump -- confirm
+    int dumperPid_;
+    Verifier_t dumperVerifier_;
+    char coreFile_[MAX_FILE_NAME];   // fixed-size buffer of MAX_FILE_NAME bytes
+};
+#endif
+
+#ifndef NAMESERVER_PROCESS
 class CIntExitReq: public CInternalReq
 {
 public:
@@ -1034,6 +1082,34 @@
 #endif
 
 #ifndef NAMESERVER_PROCESS
+class CIntEventReq: public CInternalReq   // Internal request created by CReqQueue::enqueueEventReq(); eyecatcher "RqIE"
+{
+public:
+    CIntEventReq();
+    virtual ~CIntEventReq();
+
+    void prepRequest( struct event_def *eventDef );   // called with the caller's event_def before the request is queued
+    void performRequest();
+
+    void * operator new(size_t size);                      // class-specific allocation; implementation is not in this header
+    void operator delete(void *deadObject, size_t size);   // sized class-specific deallocation paired with operator new above
+
+private:
+    void populateRequestString( void );
+
+    int eventId_;
+    int length_;                  // NOTE(review): presumably the byte length of the event data -- confirm in prepRequest()
+    int targetNid_;               // NOTE(review): target* presumably identify the process receiving the event -- confirm
+    int targetPid_;
+    Verifier_t targetVerifier_;
+
+    enum {SMALL_DATA_SIZE=50};    // capacity of the inline data_ buffer below
+    char data_[SMALL_DATA_SIZE];  // inline buffer of SMALL_DATA_SIZE bytes
+    char *bigData_;               // NOTE(review): presumably used when the data exceeds SMALL_DATA_SIZE; ownership not visible here -- confirm
+};
+#endif
+
+#ifndef NAMESERVER_PROCESS
 class CIntIoDataReq: public CInternalReq
 {
 public:
@@ -1413,6 +1489,7 @@
     string new_name_;
 };
 
+#ifndef NAMESERVER_PROCESS
 class CIntNodeAddReq: public CInternalReq
 {
 public:
@@ -1461,6 +1538,7 @@
     Verifier_t req_verifier_;
     int  pnid_;
 };
+#endif
 
 class CIntDownReq: public CInternalReq
 {
@@ -1476,34 +1554,6 @@
     int pnid_;
 };
 
-class CIntSoftNodeDownReq: public CInternalReq
-{
-public:
-    CIntSoftNodeDownReq( int pnid );
-    virtual ~CIntSoftNodeDownReq();
-
-    void performRequest();
-
-private:
-    void populateRequestString( void );
-
-    int pnid_;
-};
-
-class CIntSoftNodeUpReq: public CInternalReq
-{
-public:
-    CIntSoftNodeUpReq( int pnid );
-    virtual ~CIntSoftNodeUpReq();
-
-    void performRequest();
-
-private:
-    void populateRequestString( void );
-
-    int pnid_;
-};
-
 class CIntUpReq: public CInternalReq
 {
 public:
@@ -1636,6 +1686,9 @@
     void enqueueCloneReq( struct clone_def *cloneDef );
 #ifndef NAMESERVER_PROCESS
     void enqueueDeviceReq( char *ldevName );
+    void enqueueDumpCompleteReq( struct dump_def *dumpDef );
+    void enqueueDumpReq( struct dump_def *dumpDef );
+    void enqueueEventReq( struct event_def *eventDef );
 #endif
 #ifndef NAMESERVER_PROCESS
     void enqueueExitReq( struct exit_def *exitDef );
@@ -1675,6 +1728,7 @@
                                    , int req_pid
                                    , Verifier_t req_verifier
                                    , char *node_name );
+#ifndef NAMESERVER_PROCESS
     void enqueueNodeAddReq( int req_nid
                           , int req_pid
                           , Verifier_t req_verifier
@@ -1687,14 +1741,13 @@
                              , int req_pid
                              , Verifier_t req_verifier
                              , int pnid );
+#endif
     void enqueueDownReq( int pnid );
     void enqueueNodeNameReq( int req_nid
                            , int req_pid
                            , Verifier_t req_verifier
                            , char *current_name
                            , char *new_name);
-    void enqueueSoftNodeDownReq( int pnid );
-    void enqueueSoftNodeUpReq( int pnid );
     void enqueueShutdownReq( int level );
     void enqueueActivateSpareReq( CNode *spareNode, CNode *downNode, bool checkHealth=false );
     void enqueueUpReq( int pnid, char *node_name, int merge_lead );
@@ -1773,12 +1826,14 @@
 /* CRequest eyecatcher_ assignments:
 
    CInternalReq:
-
       RQIA   CIntAttachedDeathReq
       RQIB   CPostQuiesceReq
       RQIC   CIntChildDeathReq
       RQID   CIntDeviceReq
+      RqIC   CIntDumpCompleteReq
+      RqID   CIntDumpReq
       RQIE   CIntExitReq
+      RqIE   CIntEventReq
       RQIF   CIntUniqStrReq
       RQIG   CIntSnapshotReq
       RQIH   CIntShutdownReq
@@ -1799,8 +1854,8 @@
       RQIU   CQuiesceReq
       RQIV   CIntTmReadyReq
       RQIW   CIntCreatePrimitiveReq
-      RQIX   CIntSoftNodeDownReq
-      RQIY   CIntSoftNodeUpReq
+      RQIX   -
+      RQIY   -
       RQIZ   CIntNodeNameReq
       RqIA   CIntNameServerAddReq
       RqIB   CIntNameServerDeleteReq
@@ -1837,7 +1892,7 @@
       RqER   CExtShutdownNsReq
       RQES   CExtStartupReq
       RQET   CExtTmLeaderReq
-      RQEV   CExtTmSyncReq
+      RQEV   CExtInstanceIdReq
       RQEW   CExtZoneInfoReq
       RQEX   CExtNodeAddReq
       RQEY   CExtNodeDeleteReq
diff --git a/core/sqf/monitor/linux/reqtmleader.cxx b/core/sqf/monitor/linux/reqtmleader.cxx
index 9d4fb96..9b0db84 100644
--- a/core/sqf/monitor/linux/reqtmleader.cxx
+++ b/core/sqf/monitor/linux/reqtmleader.cxx
@@ -160,14 +160,24 @@
                 }
             }
 
-            assert(process); 
-
-            // populate the TM leader process info
-            msg_->u.reply.type = ReplyType_Generic;
-            msg_->u.reply.u.generic.nid = process->GetNid();
-            msg_->u.reply.u.generic.pid = process->GetPid();
-            msg_->u.reply.u.generic.verifier = process->GetVerifier();
-            strcpy (msg_->u.reply.u.generic.process_name, process->GetName());
+            if (process)
+            {
+                // populate the TM leader process info
+                msg_->u.reply.type = ReplyType_Generic;
+                msg_->u.reply.u.generic.nid = process->GetNid();
+                msg_->u.reply.u.generic.pid = process->GetPid();
+                msg_->u.reply.u.generic.verifier = process->GetVerifier();
+                strcpy (msg_->u.reply.u.generic.process_name, process->GetName());
+            }
+            else
+            {
+                tmLeaderNid = -1;
+                msg_->u.reply.type = ReplyType_Generic;
+                msg_->u.reply.u.generic.nid = -1;
+                msg_->u.reply.u.generic.pid = -1;
+                msg_->u.reply.u.generic.verifier = -1;
+                msg_->u.reply.u.generic.process_name[0] = 0;
+            }
 
             if (process && NameServerEnabled)
             {
diff --git a/core/sqf/monitor/linux/reqtmsync.cxx b/core/sqf/monitor/linux/reqtmsync.cxx
deleted file mode 100644
index 50bf6a3..0000000
--- a/core/sqf/monitor/linux/reqtmsync.cxx
+++ /dev/null
@@ -1,100 +0,0 @@
-///////////////////////////////////////////////////////////////////////////////
-//
-// @@@ START COPYRIGHT @@@
-//
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-//
-// @@@ END COPYRIGHT @@@
-//
-///////////////////////////////////////////////////////////////////////////////
-
-#include <stdio.h>
-#include "reqqueue.h"
-#include "montrace.h"
-#include "monsonar.h"
-#include "monlogging.h"
-
-extern CMonStats *MonStats;
-extern CMonitor *Monitor;
-
-CExtTmSyncReq::CExtTmSyncReq (reqQueueMsg_t msgType, int pid,
-                              struct message_def *msg )
-    : CExternalReq(msgType, pid, msg)
-{
-    // Add eyecatcher sequence as a debugging aid
-    memcpy(&eyecatcher_, "RQEV", 4);
-}
-
-CExtTmSyncReq::~CExtTmSyncReq()
-{
-    // Alter eyecatcher sequence as a debugging aid to identify deleted object
-    memcpy(&eyecatcher_, "rqev", 4);
-}
-
-void CExtTmSyncReq::populateRequestString( void )
-{
-    char strBuf[MON_STRING_BUF_SIZE/2] = { 0 };
-
-    snprintf( strBuf, sizeof(strBuf), 
-              "ExtReq(%s) req #=%ld requester(pid=%d) (nid=%d)"
-              , CReqQueue::svcReqType[reqType_], getId(), pid_
-              , msg_->u.request.u.tm_sync.nid );
-    requestString_.assign( strBuf );
-}
-
-
-void CExtTmSyncReq::performRequest()
-{
-    const char method_name[] = "CExtTmSyncReq::performRequest";
-    TRACE_ENTRY;
-
-    int         handle;
-    CTmSyncReq *tmsync_req;
-
-    // Record statistics (sonar counters)
-    if (sonar_verify_state(SONAR_ENABLED | SONAR_MONITOR_ENABLED))
-       MonStats->req_type_tmsync_Incr();
-
-    // Trace info about request
-    if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS))
-    {
-        trace_printf("%s@%d request #%ld: TmSync, nid=%d, tag=%d, length=%d\n",
-                     method_name, __LINE__, id_, msg_->u.request.u.tm_sync.nid,
-                     msg_->u.request.u.tm_sync.tag,
-                     msg_->u.request.u.tm_sync.length);
-    }
-
-    handle = Monitor->GetHandle();
-    tmsync_req = Monitor->Q_TmSync( msg_->u.request.u.tm_sync.nid,
-                                    handle,
-                                    msg_->u.request.u.tm_sync.data,
-                                    msg_->u.request.u.tm_sync.length, 
-                                    msg_->u.request.u.tm_sync.tag,
-                                    false );
-    msg_->u.reply.type = ReplyType_TmSync;
-    msg_->u.reply.u.tm_sync.nid = -1;
-    msg_->u.reply.u.tm_sync.pid = 0;
-    msg_->u.reply.u.tm_sync.handle = handle;
-    msg_->u.reply.u.tm_sync.return_code = MPI_SUCCESS;
-    tmsync_req->Completed = true;
-
-    // Send reply to requester
-    lioreply(msg_, pid_);
-
-    TRACE_EXIT;
-}
diff --git a/core/sqf/monitor/linux/sdtimer.cxx b/core/sqf/monitor/linux/sdtimer.cxx
index 1438bec..06ce235 100644
--- a/core/sqf/monitor/linux/sdtimer.cxx
+++ b/core/sqf/monitor/linux/sdtimer.cxx
@@ -48,10 +48,6 @@
 #include "gentrap.h"
 
 #define LUNMGR_RETRY_MAX           3
-// The following defines specify the default values for the timers if the 
-// softdog timer related variables are not defined.
-#define SDT_KeepAliveTimerDefault  5    // in seconds
-
 
 extern CWatchdog       *Watchdog;
 extern CProcessMonitor *ProcessMonitor;
@@ -101,7 +97,7 @@
          ,dumpMonitor_(false)
          ,killingNode_(false)
          ,softdog_(false)
-         ,sdtKeepAliveTimerValue_(SDT_KeepAliveTimerDefault)
+         ,sdtKeepAliveTimerValue_(WDT_KEEPALIVETIMERDEFAULT)
          ,threadId_(0)
          ,sdtLastMonRefreshCtr_(0)
 {
@@ -440,18 +436,25 @@
                     }
                     else
                     {
-                        DumpMonitorProcess();
-
-                        char buf[MON_STRING_BUF_SIZE];
-                        snprintf( buf, sizeof(buf), "Node %d going down, "
-                                  "failed to get refresh event from monitor\n",
-                                  gv_ms_su_nid);
-                        genSnmpTrap( buf );
-
-                        NodeFailSafe( timerExpired );
-                        StopSoftdogTimer();
-                        Watchdog->SetNodeDown();
-                        Watchdog->CLock::wakeOne();
+                        if( getenv("SQ_VIRTUAL_NODES") )
+                        { // Ignore expired timer in virtual cluster
+                            ResetSoftdogTimer( timeout );
+                        }
+                        else
+                        {
+                            DumpMonitorProcess();
+    
+                            char buf[MON_STRING_BUF_SIZE];
+                            snprintf( buf, sizeof(buf), "Node %d going down, "
+                                      "failed to get refresh event from monitor\n",
+                                      gv_ms_su_nid);
+                            genSnmpTrap( buf );
+    
+                            NodeFailSafe( timerExpired );
+                            StopSoftdogTimer();
+                            Watchdog->SetNodeDown();
+                            Watchdog->CLock::wakeOne();
+                        }
                     }
                 }
                 break;
@@ -493,6 +496,13 @@
         {
             trace_printf( "%s@%d Timer started!\n", method_name, __LINE__ );
         }
+
+        char la_buf[MON_STRING_BUF_SIZE];   // message text: the keep-alive interval (seconds) in effect when the timer starts
+        snprintf( la_buf, sizeof(la_buf)    // bounded write, matching the snprintf(buf, sizeof(buf), ...) usage elsewhere in this file
+                , "[%s], KeepAlive Timer in seconds = %ld\n"
+                , method_name, sdtKeepAliveTimerValue_ );
+        monproc_log_write( MON_SDTIMER_STARTSOFTDOGTIMER_1, SQ_LOG_INFO, la_buf);
+    
         clock_gettime(CLOCK_REALTIME, &expiredTime_);
         expiredTime_.tv_sec += sdtKeepAliveTimerValue_;
         SetSoftdog( true );
diff --git a/core/sqf/monitor/linux/shell.cxx b/core/sqf/monitor/linux/shell.cxx
index c36c70b..47149c1 100644
--- a/core/sqf/monitor/linux/shell.cxx
+++ b/core/sqf/monitor/linux/shell.cxx
@@ -45,6 +45,7 @@
 #include <sys/types.h>
 #include <sys/wait.h>
 #include <string>
+#include <netdb.h>
 
 #include "msgdef.h"
 #include "props.h"
@@ -72,7 +73,7 @@
 #define TRACE_SHELL_CMD         0x00001
 
 #define MAX_TOKEN   132
-#define MAX_BUFFER  132
+#define MAX_BUFFER  512
 #define MAX_CMDLINE 256
 #define MAX_DEATH_SAVE 10
 
@@ -102,6 +103,10 @@
 bool SpareNodeColdStandby = true;
 bool ElasticityEnabled = true;
 bool NameServerEnabled = false;
+bool QuietShell = false;
+bool NodeAddUseFqdn = true;
+
+AgentType_t AgentType = AgentType_Undefined;
 
 int   lastDeathNid[MAX_DEATH_SAVE] = { -1,-1,-1,-1,-1,-1,-1,-1,-1,-1 };
 int   lastDeathPid[MAX_DEATH_SAVE] = { -1,-1,-1,-1,-1,-1,-1,-1,-1,-1 };
@@ -112,6 +117,7 @@
 
 bool  nodePending = false;
 char  nodePendingName[MPI_MAX_PROCESSOR_NAME];
+int   nodePendingNid;
 int   nodePendingPnid;
 CLock nodePendingLock;
 
@@ -168,8 +174,9 @@
 void  node_delete_cmd( char *cmd );
 void  node_down( int nid, char *reason );
 void  node_down_cmd ( char *cmd );
+void  node_info_cmd( char *cmd );
 void  node_name_cmd( char *cmd );
-int   node_up( int nid, char *node_name, bool nowait=false );
+int   node_up( int nid, bool nowait=false );
 void  node_up_cmd( char *cmd, char delimiter );
 char *normalize_case (char *token);
 void  normalize_slashes (char *token);
@@ -325,12 +332,18 @@
     case State_Initializing:
         str = "Initializing";
         break;
+    case State_Merging:
+        str = "Merging";
+        break;
     case State_Merged:
         str = "Merged";
         break;
     case State_Joining:
         str = "Joining";
         break;
+    case State_Takeover:
+        str = "Takeover";
+        break;
     default:
         str = "Unknown";
     }
@@ -543,7 +556,7 @@
         // Set initial state of all physical nodes in a real cluster to StateDown
         // update_cluster_state() will set operational state of physical node
         NodeState_t nodeState = StateDown;
-        physicalNode = new CPhysicalNode( pnodeConfig->GetName(), nodeState );
+        physicalNode = new CPhysicalNode( pnodeConfig->GetFqdn(), nodeState );
         if ( physicalNode )
         {
             pnmit = PhysicalNodeMap.insert( PhysicalNodeNameMap_t::value_type
@@ -902,7 +915,7 @@
 {
     // Determine trace file name
     const char *tmpDir;
-    tmpDir = getenv( "MPI_TMPDIR" );
+    tmpDir = getenv( "TRAF_LOG" );
 
     const char *envVar;
     envVar = getenv("SHELL_TRACE_FILE");
@@ -1279,22 +1292,22 @@
     switch (recv_msg->type )
     {
     case MsgType_Change:
-        printf ("[%s] %s - Configuration Change Notice for Group: %s Key: %s Value: %s\n", 
-                MyName, time_string(),
-                recv_msg->u.request.u.change.group,
-                recv_msg->u.request.u.change.key,
-                recv_msg->u.request.u.change.value);
+        if (! QuietShell) printf ("[%s] %s - Configuration Change Notice for Group: %s Key: %s Value: %s\n", 
+                                  MyName, time_string(),
+                                  recv_msg->u.request.u.change.group,
+                                  recv_msg->u.request.u.change.key,
+                                  recv_msg->u.request.u.change.value);
         break;
 
     case MsgType_Event:
-        printf("[%s] %s - Event %d received\n",
-               MyName, time_string(), recv_msg->u.request.u.event_notice.event_id);
+        if (! QuietShell) printf("[%s] %s - Event %d received\n",
+                                 MyName, time_string(), recv_msg->u.request.u.event_notice.event_id);
         break;
-
+    
     case MsgType_NodeAdded:
-        printf ("[%s] %s - Node %d (%s) ADDED to configuration\n",
-                MyName, time_string(), recv_msg->u.request.u.node_added.nid,
-                recv_msg->u.request.u.node_added.node_name);
+        if (! QuietShell) printf ("[%s] %s - Node %d (%s) ADDED to configuration\n", 
+                                  MyName, time_string(), recv_msg->u.request.u.node_added.nid,
+                                  recv_msg->u.request.u.node_added.node_name);
         if ( !load_configuration() )
         {
             exit (1);
@@ -1313,9 +1326,9 @@
         break;
 
     case MsgType_NodeChanged:
-        printf ("[%s] %s - Node %d (%s) CHANGED in configuration\n",
-                MyName, time_string(), recv_msg->u.request.u.node_changed.nid,
-                recv_msg->u.request.u.node_changed.node_name);
+        if (! QuietShell) printf ("[%s] %s - Node %d (%s) CHANGED in configuration\n", 
+                                  MyName, time_string(), recv_msg->u.request.u.node_changed.nid,
+                                  recv_msg->u.request.u.node_changed.node_name);
         if ( !load_configuration() )
         {
             exit (1);
@@ -1334,9 +1347,9 @@
         break;
 
     case MsgType_NodeDeleted:
-        printf ("[%s] %s - Node %d (%s) DELETED from configuration\n",
-                MyName, time_string(), recv_msg->u.request.u.node_deleted.nid,
-                recv_msg->u.request.u.node_deleted.node_name);
+        if (! QuietShell) printf ("[%s] %s - Node %d (%s) DELETED from configuration\n", 
+                                  MyName, time_string(), recv_msg->u.request.u.node_deleted.nid,
+                                  recv_msg->u.request.u.node_deleted.node_name);
         if ( !load_configuration() )
         {
             exit (1);
@@ -1355,17 +1368,26 @@
         break;
 
     case MsgType_NodeDown:
-        printf ("[%s] %s - Node %d (%s) is DOWN\n", 
-                MyName, time_string(), recv_msg->u.request.u.down.nid,
-                recv_msg->u.request.u.down.node_name );
+        if (! QuietShell) printf ("[%s] %s - Node %d (%s) is DOWN\n", 
+                                  MyName, time_string(), recv_msg->u.request.u.down.nid,
+                                  recv_msg->u.request.u.down.node_name );
         NodeState[recv_msg->u.request.u.down.nid] = false;
 
         if ( nodePending )
         {
-            if ( strcmp( nodePendingName, recv_msg->u.request.u.down.node_name) == 0 )
-            {   // The node that was supposed to come up had some problem
-                // and went down.
-                nodePendingComplete();
+            if ( VirtualNodes )
+            {
+                if (recv_msg->u.request.u.down.nid == nodePendingNid)
+                {
+                    nodePendingComplete();
+                }
+            }
+            else
+            {
+                if ( strcmp( nodePendingName, recv_msg->u.request.u.down.node_name) == 0 )
+                {
+                    nodePendingComplete();
+                }
             }
         }
         if ( waitDeathPending )
@@ -1380,25 +1402,17 @@
 
 
     case MsgType_NodeJoining:
-        printf ("[%s] %s - Node %s %s\n"
-                , MyName
-                , time_string()
-                , recv_msg->u.request.u.joining.node_name
-                , join_phase_string(recv_msg->u.request.u.joining.phase) );
-        break;
-
-
-    case MsgType_NodePrepare:
-        printf("[%s] %s - Node %s (%d) node-up preparation, takeover=%s\n",
-               MyName, time_string(), recv_msg->u.request.u.prepare.node_name,
-               recv_msg->u.request.u.prepare.nid,
-               ((recv_msg->u.request.u.prepare.takeover)? "true": "false"));
+        if (! QuietShell) printf ("[%s] %s - Node %s %s\n"
+                                  , MyName
+                                  , time_string()
+                                  , recv_msg->u.request.u.joining.node_name 
+                                  , join_phase_string(recv_msg->u.request.u.joining.phase) );
         break;
 
     case MsgType_NodeQuiesce:
-        printf ("[%s] %s - Node %d (%s) is QUIESCEd\n", 
-                MyName, time_string(), msg->u.request.u.quiesce.nid,
-                msg->u.request.u.quiesce.node_name );
+        if (! QuietShell) printf ("[%s] %s - Node %d (%s) is QUIESCEd\n", 
+                                  MyName, time_string(), msg->u.request.u.quiesce.nid,
+                                  msg->u.request.u.quiesce.node_name );
         NodeState[msg->u.request.u.quiesce.nid] = false;
         if ( waitDeathPending )
         {
@@ -1410,10 +1424,10 @@
         break;
 
     case MsgType_NodeUp:
-        printf ("[%s] %s - Node %d (%s) is UP\n",
-                MyName, time_string(), recv_msg->u.request.u.up.nid,
-                recv_msg->u.request.u.up.node_name);
-        NodeState[recv_msg->u.request.u.down.nid] = true;
+        if (! QuietShell) printf ("[%s] %s - Node %d (%s) is UP\n", 
+                                  MyName, time_string(), recv_msg->u.request.u.up.nid,
+                                  recv_msg->u.request.u.up.node_name);
+        NodeState[recv_msg->u.request.u.down.nid] = true;        
         if ( nodePending )
         {
             if ( strcmp( nodePendingName, recv_msg->u.request.u.up.node_name) == 0 )
@@ -1426,34 +1440,34 @@
     case MsgType_ProcessCreated:
         if ( recv_msg->u.request.u.process_created.return_code == MPI_SUCCESS )
         {
-            printf ("[%s] %s - Process %s successfully created. Nid=%d, Pid=%d\n",
-                    MyName, time_string(), recv_msg->u.request.u.process_created.process_name,
-                    recv_msg->u.request.u.process_created.nid,
-                    recv_msg->u.request.u.process_created.pid);
+            if (! QuietShell) printf ("[%s] %s - Process %s successfully created. Nid=%d, Pid=%d\n",
+                                      MyName, time_string(), recv_msg->u.request.u.process_created.process_name,
+                                      recv_msg->u.request.u.process_created.nid,
+                                      recv_msg->u.request.u.process_created.pid);
         }
         else
         {
-            printf ("[%s] %s - Process %s NOT created. Nid=%d, Pid=%d\n",
-                    MyName, time_string(), recv_msg->u.request.u.process_created.process_name,
-                    recv_msg->u.request.u.process_created.nid,
-                    recv_msg->u.request.u.process_created.pid);
+            if (! QuietShell) printf ("[%s] %s - Process %s NOT created. Nid=%d, Pid=%d\n",
+                                      MyName, time_string(), recv_msg->u.request.u.process_created.process_name,
+                                      recv_msg->u.request.u.process_created.nid,
+                                      recv_msg->u.request.u.process_created.pid);
         }
         break;
 
     case MsgType_ProcessDeath:
         if ( recv_msg->u.request.u.death.aborted )
         {
-            printf ("[%s] %s - Process %s abnormally terminated. Nid=%d, Pid=%d\n",
-                    MyName, time_string(), recv_msg->u.request.u.death.process_name, 
-                    recv_msg->u.request.u.death.nid,
-                    recv_msg->u.request.u.death.pid);
+            if (! QuietShell) printf ("[%s] %s - Process %s abnormally terminated. Nid=%d, Pid=%d\n",
+                                      MyName, time_string(), recv_msg->u.request.u.death.process_name, 
+                                      recv_msg->u.request.u.death.nid,
+                                      recv_msg->u.request.u.death.pid);
         }
         else
         {
-            printf ("[%s] %s - Process %s terminated normally. Nid=%d, Pid=%d\n", 
-                    MyName, time_string(), recv_msg->u.request.u.death.process_name, 
-                    recv_msg->u.request.u.death.nid,
-                    recv_msg->u.request.u.death.pid);
+            if (! QuietShell) printf ("[%s] %s - Process %s terminated normally. Nid=%d, Pid=%d\n", 
+                                      MyName, time_string(), recv_msg->u.request.u.death.process_name, 
+                                      recv_msg->u.request.u.death.nid,
+                                      recv_msg->u.request.u.death.pid);
         }
         for ( int dinx = 0; dinx < MAX_DEATH_SAVE; dinx++ )
         {
@@ -1475,43 +1489,33 @@
         break;
 
     case MsgType_SpareUp:
-        printf ("[%s] %s - Node %s is Spare Node and available\n"
-                , MyName
-                , time_string()
-                , recv_msg->u.request.u.spare_up.node_name );
+        if (! QuietShell) printf ("[%s] %s - Node %s is Spare Node and available\n"
+                                  , MyName
+                                  , time_string()
+                                  , recv_msg->u.request.u.spare_up.node_name );
         nodePendingComplete();
         break;
 
     case MsgType_Shutdown:
-        printf("[%s] %s - Shutdown notice, level=%d received\n",
-               MyName, time_string(), recv_msg->u.request.u.shutdown.level);
+        if (! QuietShell) printf("[%s] %s - Shutdown notice, level=%d received\n",
+                                 MyName, time_string(), recv_msg->u.request.u.shutdown.level);
         nodePendingComplete();
         break;
-
-    case MsgType_TmSyncAbort:
-        printf("[%s] %s - TmSync abort notice received\n",
-               MyName, time_string());
-        break;
-    case MsgType_TmSyncCommit:
-        printf("[%s] %s - TmSync commit notice received\n",
-               MyName, time_string());
-        break;
-
     case MsgType_ReintegrationError:
-        printf ("[%s] %s - %s\n"
-                , MyName
-                , time_string()
-                , recv_msg->u.request.u.reintegrate.msg );
+        if (! QuietShell) printf ("[%s] %s - %s\n"
+                                  , MyName
+                                  , time_string()
+                                  , recv_msg->u.request.u.reintegrate.msg );
         nodePendingComplete();
         break;
 
     default:
-        printf("[%s] %s - Unexpected notice type(%d) received\n",
-               MyName, time_string(), recv_msg->type);
+        if (! QuietShell) printf("[%s] %s - Unexpected notice type(%d) received\n",
+                                 MyName, time_string(), recv_msg->type);
 
     }
 
-    printf( "%s", prompt );
+    if (! QuietShell) printf( "%s", prompt );
     fflush( stdout );
 }
 
@@ -1722,6 +1726,22 @@
     return( up );
 }
 
+// Report whether logical node 'nid' is currently in the State_Up state.
+// Returns false when get_zone_state() fails or reports any other state.
+bool is_node_up( int nid )
+{
+    STATE nodeState;
+    int   zoneId = -1;
+    int   physicalNid;
+    int   logicalNid = nid;  // local copy handed to get_zone_state()
+    char  nodeName[MAX_TOKEN] = { 0 };
+
+    bool found = get_zone_state( logicalNid, zoneId, nodeName
+                               , physicalNid, nodeState );
+
+    return( found && nodeState == State_Up );
+}
+
 void exit_process (void)
 {
     int count;
@@ -2094,6 +2114,28 @@
     while (!done);
 }
 
+// Look up the configuration of logical node 'nid' and copy the string its
+// physical node's GetFqdn() returns into the caller supplied 'node_name'.
+// Returns 0 on success, -1 when no logical or physical node config is found.
+int get_node_name_by_nid( int nid, char *node_name )
+{
+    CLNodeConfig *lnodeConfig = ClusterConfig.GetLNodeConfig( nid );
+    if ( !lnodeConfig )
+    {
+        return( -1 );
+    }
+
+    CPNodeConfig *pnodeConfig = lnodeConfig->GetPNodeConfig();
+    if ( !pnodeConfig )
+    {
+        return( -1 );
+    }
+
+    strcpy( node_name, pnodeConfig->GetFqdn() );
+
+    return( 0 );
+}
+
 bool get_nameserver_by_node_name( char *node_name )
 {
     CNameServerConfig *config;
@@ -2129,19 +2171,8 @@
     switch (persistConfig->GetZoneZidFormat())
     {
     case Zid_ALL:
-        for (int i = 0; i < LNodesConfigMax; i++)
-        {
-            if ( i == 0 )
-            {
-                sprintf( zoneStr, "%d", i );
-                strcpy( persistZones, zoneStr );
-            }
-            else
-            {
-                sprintf( zoneStr, ",%d", i );
-                strcat( persistZones, zoneStr );
-            }
-        }
+        sprintf( zoneStr, "%d (ALL)", -1 );
+        strcat( persistZones, zoneStr );
         break;
     case Zid_RELATIVE:
         sprintf( zoneStr, "%d", nid );
@@ -2872,7 +2903,66 @@
     return( -1 );
 }
 
-int get_node_name( char *node_name )
+// Resolve 'nodeName' to a host name via getaddrinfo() (IPv4 only) followed by
+// a reverse getnameinfo( NI_NAMEREQD ) lookup on each address returned.
+// Copies the first name found into 'fqdn' and returns 0; returns -1 otherwise.
+int get_fqdn_by_name( char *nodeName, char *fqdn )
+{
+    int rc;
+    bool found = false;
+    char hbuf[NI_MAXHOST];
+    struct addrinfo hints;
+    struct addrinfo *result = NULL;
+
+    if ( !nodeName || !fqdn )
+    {
+        return( -1 );
+    }
+
+    memset(&hints, 0, sizeof(struct addrinfo));
+    hints.ai_family = AF_INET;      // Allow IPv4 only; other hints stay zero
+
+    // getaddrinfo() returns a list of address structures.
+    rc = getaddrinfo( nodeName, NULL, &hints, &result);
+    if (rc != 0)
+    {
+        fprintf( stderr
+               , "Could not resolve host address, getaddrinfo(%s): %s\n"
+               , nodeName, gai_strerror(rc) );
+        return( -1 );
+    }
+
+    for (struct addrinfo *rp = result; rp != NULL && !found; rp = rp->ai_next)
+    {
+        rc = getnameinfo( rp->ai_addr, rp->ai_addrlen
+                        , hbuf, sizeof(hbuf), NULL, 0, NI_NAMEREQD);
+        if (rc != 0)
+        {
+            fprintf( stderr
+                   , "Could not resolve hostname, getnameinfo(%s, NI_NAMEREQD): %s\n"
+                   , nodeName, gai_strerror(rc) );
+        }
+        else
+        {
+            found = true; // Good one, we're done!
+        }
+    }
+
+    // The outcome is kept in 'found' because the list nodes the loop walked
+    // are released here and must not be examined afterwards.
+    freeaddrinfo(result);   // No longer needed
+
+    if (!found)
+    {
+        return( -1 );
+    }
+
+    strcpy( fqdn, hbuf );
+
+    return( 0 );
+}
+
+int get_node_name( char *node_name, char *short_node_name )
 {
     CPNodeConfig   *pnodeConfig;
 
@@ -2881,6 +2971,10 @@
     {
         if ( CPNodeConfigContainer::hostnamecmp( node_name, pnodeConfig->GetName() ) == 0 )
         {
+            if (short_node_name)
+            {
+                strcpy( short_node_name, pnodeConfig->GetName() );
+            }
             return( 0 );
         }
     }
@@ -2888,6 +2982,29 @@
     return( -1 );
 }
 
+// Copy the host part of 'node_name' (the text before its first '.', or the
+// whole string when there is no '.') into the caller's 'short_node_name'.
+// Returns 0 on success, -1 when either pointer is NULL.
+int get_short_node_name( char *node_name, char *short_node_name )
+{
+    if ( !node_name || !short_node_name )
+    {
+        return( -1 );
+    }
+
+    // Number of characters that precede the first '.', if there is one
+    const char *dot = strchr( node_name, '.' );
+    size_t len = dot ? (size_t)(dot - node_name) : strlen( node_name );
+
+    // Copy straight into the destination; staging the name in a fixed
+    // 1024-byte array overran it for longer names.  memmove() keeps the
+    // call well defined when both arguments are the same buffer.
+    memmove( short_node_name, node_name, len );
+    short_node_name[len] = '\0';
+
+    return( 0 );
+}
+
 bool get_more_proc_info(PROCESSTYPE process_type, bool allNodes)
 {
     bool replyOk = false;
@@ -3357,7 +3474,7 @@
             {
                 if (!VirtualNodes)
                 {
-                    strcpy( PNode[pnodeConfig->GetPNid()], pnodeConfig->GetName() );
+                    strcpy( PNode[pnodeConfig->GetPNid()], pnodeConfig->GetFqdn() );
                 }
                 else
                 {
@@ -4091,7 +4208,7 @@
                 printf( "node-id=%d, node-name=%s, "
                         "cores=%s, processors=%d, roles=%s\n"
                       , lnodeConfig->GetNid()
-                      , lnodeConfig->GetName()
+                      , lnodeConfig->GetFqdn()
                       , coresString
                       , lnodeConfig->GetProcessors()
                       , RoleTypeString( lnodeConfig->GetZoneType() )
@@ -4267,12 +4384,24 @@
 void node_down( int nid, char *reason )
 {
     const char method_name[] = "node_down";
+    int pnid = -1;
     char msgString[MAX_BUFFER] = { 0 };
+    char node_name[MAX_TOKEN] = { 0 };
 
     if ( trace_settings & TRACE_SHELL_CMD )
         trace_printf("%s@%d [%s] sending down node message.\n",
                      method_name, __LINE__, MyName);
 
+    if ( get_node_name_by_nid( nid, node_name ) != 0 )
+    {
+        sprintf( msgString, "[%s] Invalid node id!\n", MyName);
+        write_startup_log( msgString );
+        printf ("[%s] Invalid node id!\n", MyName);
+        return;
+    }
+    
+    pnid = get_pnid_by_nid( nid );
+
     if ( gp_local_mon_io->acquire_msg( &msg ) != 0 )
     {   // Could not acquire a message buffer
         sprintf( msgString, "[%s] Unable to acquire message buffer.\n", MyName);
@@ -4285,11 +4414,34 @@
     msg->noreply = true;
     msg->u.request.type = ReqType_NodeDown;
     msg->u.request.u.down.nid = nid;
-    STRCPY(msg->u.request.u.down.node_name, Node[nid]);
+    STRCPY(msg->u.request.u.down.node_name, node_name);
     STRCPY(msg->u.request.u.down.reason, reason);
 
     gp_local_mon_io->send( msg );
 
+    struct sigaction int_act, old_act;
+    int_act.sa_sigaction = interrupt_handler;
+    sigemptyset(&int_act.sa_mask);
+    sigaddset (&int_act.sa_mask, SIGINT);
+    int_act.sa_flags = SA_SIGINFO;
+    sigaction (SIGINT, &int_act, &old_act);
+
+    nodePending = true;
+    nodePendingNid = nid;
+    nodePendingPnid = pnid;
+    STRCPY(nodePendingName, node_name);
+
+    if ( trace_settings & TRACE_SHELL_CMD )
+        trace_printf( "%s@%d [%s] Waiting for node down notice, node_name=%s\n",
+                      method_name, __LINE__, MyName, node_name );
+
+    nodePendingLock.lock();
+    nodePendingLock.wait();
+    nodePendingLock.unlock();
+
+    sigaction (SIGINT, &old_act, NULL);
+
+    nodePending = false;
     NodeState[nid] = false;
 }
 
@@ -4470,17 +4622,26 @@
     gp_local_mon_io->release_msg(msg);
 }
 
-int node_up( int nid, char *node_name, bool nowait )
+int node_up( int nid, bool nowait )
 {
     const char method_name[] = "node_up";
     bool integrating = false;
-    int pnid;
+    int pnid = -1;
     int rc = -1;
     char msgString[MAX_BUFFER] = { 0 };
+    char node_name[MAX_TOKEN] = { 0 };
 
     // If this is a real cluster
-    if ( nid == -1 )
+    if ( !VirtualNodes )
     {
+        if ( get_node_name_by_nid( nid, node_name ) != 0 )
+        {
+            sprintf( msgString, "[%s] Invalid node id!\n", MyName);
+            write_startup_log( msgString );
+            printf ("[%s] Invalid node id!\n", MyName);
+            return( rc ) ;
+        }
+        
         // Get current physical state of all nodes
         if ( !update_node_state( node_name, false ) )
         {
@@ -4587,7 +4748,7 @@
     msg->u.request.u.up.nid = nid;
 
     // If this is a real cluster
-    if ( nid == -1 )
+    if ( !VirtualNodes )
     {
         if ( trace_settings & TRACE_SHELL_CMD )
             trace_printf( "%s@%d [%s] %s node up successful, rtn=%d\n ",
@@ -5102,7 +5263,6 @@
         }
         break;
     case Nid_Undefined:
-        nid = 0;
         get_persist_process_attributes( persistConfig
                                       , -1
                                       , process_type
@@ -5130,6 +5290,14 @@
             sprintf( programNameAndArgs, "%s"
                    , persistConfig->GetProgramName() );
         }
+        // Find the first up nid
+        for ( nid = 0; nid < NumLNodes; nid++ )
+        {
+            if (is_node_up( nid ))
+            {
+                break;
+            }
+        }
         snprintf(outpath, MAX_FILE_NAME, "%s/%s", getenv("TRAF_LOG"), outfile);
         pid = start_process( &nid
                            , process_type
@@ -5757,17 +5925,17 @@
     // do it here so that variables can be overwritten
     char **xvals = NULL;
     MON_Props xprops(true);
-    strcpy (sqvar, getenv("TRAF_VAR"));
-    char *envfile = new char [strlen(sqvar)+11];
+    strcpy (sqvar, getenv("TRAF_CONF"));
+    char *envfile = new char [strlen(sqvar)+20];
     strcpy(envfile, sqvar);
-    strcat(envfile, "/shell.env");
+    strcat(envfile, "/monitor.env");
     xprops.load(envfile);
     delete [] envfile;
     MON_Smap_Enum xenum(&xprops);
     int xsize = xprops.size();
     int xinx;
     if (xsize > 0) {
-        printf("[%s] - Warning using shell.env\n",MyName);
+        printf("[%s] - Warning using monitor.env\n",MyName);
         xvals = new char*[2*xsize];
         xinx = 0;
         while (xenum.more())
@@ -6324,13 +6492,13 @@
     {
         dir = getenv("SQ_SNAPSHOT_DIR");
         if (dir == NULL)
-            dir = getenv("PWD");
+            dir = getenv("TRAF_LOG");
     }
     // convert to absolute path
     if (dir[0] == '/')
         strcpy(path, dir);
     else
-        sprintf(path, "%s/%s", getenv("PWD"), dir);
+        sprintf(path, "%s/%s", getenv("TRAF_LOG"), dir);
 
     if (*cmd_tail)
     {
@@ -6404,7 +6572,7 @@
                     trace_printf("%s@%d [%s] dumped process successfully. "
                                  "error=%s\n", method_name, __LINE__, MyName,
                                  ErrorMsg(msg->u.reply.u.dump.return_code));
-                printf("dump file created: %s\n",
+                printf("dump file created@ %s\n",
                        msg->u.reply.u.dump.core_file);
             }
             else
@@ -7492,21 +7660,13 @@
                 // [ <nid> ]
                 if ( *cmd )
                 {
-                    nid = atoi (cmd);
-                    pnid = get_pnid_by_nid( nid );
-                    if ( pnid == -1 )
-                    {
-                        printf( "[%s] Node id %d does not exist in configuration!\n"
-                              , MyName, nid );
-                        return;
-                    }
-                    node_info(nid);
+                    node_info_cmd( cmd );
                     CurNodes = NumLNodes-NumDown;
                 }
                 else
                 {
                     // display all nodes
-                    node_info(-1);
+                    node_info( -1 );
                     CurNodes = NumLNodes-NumDown;
                 }
             }
@@ -7571,6 +7731,8 @@
     bool process_cmd = false;
     char *cmd_tail = cmd;
     char name[MPI_MAX_PROCESSOR_NAME] = { 0 };
+    char node_name[MPI_MAX_PROCESSOR_NAME] = { 0 };
+    char fqdn_name[MPI_MAX_PROCESSOR_NAME] = { 0 };
     char token[MAX_TOKEN] = { 0 };
     int  first_core, last_core, processor_count, roles;
     char msgString[MAX_BUFFER] = { 0 };
@@ -7602,6 +7764,24 @@
             {
                 cmd_tail = get_token( cmd_tail, name, &delimiter, MPI_MAX_PROCESSOR_NAME-1, false );
                 //printf ("[%s] node-name=%s, delimeter=%c\n", MyName, name, delimiter);
+                if (NodeAddUseFqdn)
+                {
+                    if(get_fqdn_by_name( name, fqdn_name ) == -1)
+                    {
+                        fprintf( stderr
+                               , "Fully Qualified Domain Name not available for hostname %s\n"
+                               , name );
+                        return;
+                    }
+                    else
+                    {
+                        strncpy( node_name, fqdn_name, sizeof(node_name) );
+                    }
+                }
+                else
+                {
+                    strncpy( node_name, name, sizeof(node_name) );
+                }
             }
             else if (strcmp( token, "cores" ) == 0)
             {
@@ -7658,7 +7838,7 @@
         }
 
         // Check for required values (currently all but last_core are required)
-        if (name[0] != 0
+        if (node_name[0] != 0
          && first_core != -1
          && processor_count != -1
          && roles != 0)
@@ -7675,7 +7855,7 @@
 
     if ( process_cmd )
     {
-        node_add( name, first_core, last_core, processor_count, roles );
+        node_add( node_name, first_core, last_core, processor_count, roles );
     }
     else
     {
@@ -7692,6 +7872,7 @@
     char *cmd_tail = cmd;
     char delim;
     char token[MAX_TOKEN] = { 0 };
+    char short_node_name[MAX_TOKEN] = { 0 };
     int nid = -1;
     int pnid = -1;
 
@@ -7717,7 +7898,7 @@
         }
         else
         {
-            if ( get_node_name( token ) != 0 )
+            if ( get_node_name( token, short_node_name ) != 0 )
             {
                 printf( "[%s] Node %s does not exist in configuration!\n"
                       , MyName, token );
@@ -7726,7 +7907,7 @@
         }
     }
 
-    node_config( nid, token );
+    node_config( nid, short_node_name );
 }
 
 void node_delete_cmd( char *cmd )
@@ -7757,7 +7938,7 @@
         }
         else
         {
-            if ( get_node_name( token ) != 0 )
+            if ( get_node_name( token, NULL ) != 0 )
             {
                 sprintf( msgString, "[%s] Node %s does not exist in configuration!"
                        , MyName, token);
@@ -7848,7 +8029,7 @@
         write_startup_log( msgString );
         printf ("%s\n", msgString);
 
-        if ( get_node_name( token ) != 0 ) 
+        if ( get_node_name( token, NULL ) != 0 ) 
         {
             sprintf( msgString, "[%s] Node %s does not exist in configuration!"
                    , MyName, token);
@@ -7899,6 +8080,78 @@
     NodeState[nid] = false;
 }
 
+// Handle the shell's node info command when it carries an argument: display
+// the single node selected by the first token of 'cmd'.  Arguments that are
+// rejected are reported on stdout and through write_startup_log().
+void node_info_cmd( char *cmd )
+{
+    const char method_name[] = "node_info_cmd";
+    char separator;
+    char arg[MAX_TOKEN];
+    char logText[MAX_BUFFER] = { 0 };
+    char nodeName[MAX_TOKEN] = { 0 };
+    int  nid = -1;
+
+    if ( trace_settings & TRACE_SHELL_CMD )
+        trace_printf ("%s@%d [%s] processing node info command.\n",
+                      method_name, __LINE__, MyName);
+
+    // Either cluster flavor identifies the node by the first token
+    get_token( cmd, arg, &separator );
+    const bool numeric = isNumeric( arg );
+
+    if ( VirtualNodes )
+    {
+        // Virtual clusters accept only a node id in [0, CurNodes - 1]
+        bool inRange = false;
+        if ( numeric )
+        {
+            nid = atoi( arg );
+            inRange = (nid >= 0) && (nid <= (CurNodes - 1));
+        }
+        if ( inRange )
+        {
+            // 1:1 mapping of virtual logical to physical nodes
+            node_info( nid );
+        }
+        else
+        {
+            sprintf( logText, "[%s] Invalid node id!",MyName);
+            write_startup_log( logText );
+            printf ("%s\n", logText);
+        }
+        return;
+    }
+
+    // Real cluster: the argument is either a node id or a node name
+    if ( numeric )
+    {
+        nid = atoi( arg );
+        if ( get_node_name_by_nid( nid, nodeName ) != 0 )
+        {
+            sprintf( logText, "[%s] Invalid node id!\n", MyName);
+            write_startup_log( logText );
+            printf ("[%s] Invalid node id!\n", MyName);
+            return;
+        }
+    }
+    else
+    {
+        if ( get_node_name( arg, NULL ) != 0 )
+        {
+            sprintf( logText, "[%s] Node %s does not exist in configuration!"
+                   , MyName, arg);
+            write_startup_log( logText );
+            printf ("%s\n", logText);
+            return;
+        }
+        STRCPY(nodeName, arg);
+        nid = get_first_nid( nodeName );
+    }
+
+    node_info( nid );
+}
+
 void node_name_cmd( char *cmd )
 {
     const char method_name[] = "node_name_cmd";
@@ -7938,7 +8191,7 @@
         else
         {
             STRCPY(node_name, token);
-            if ( get_node_name( node_name ) != 0 )
+            if ( get_node_name( node_name, NULL ) != 0 )
             {
                 sprintf( msgString, "[%s] Node %s is not configured!"
                        , MyName, node_name);
@@ -7961,7 +8214,7 @@
                 printf ("%s\n", msgString );
                 return;
             }
-            if ( get_node_name( new_node_name ) == 0 )
+            if ( get_node_name( new_node_name, NULL ) == 0 )
             {
                 sprintf( msgString, "[%s] Node %s is already configured!"
                        , MyName, new_node_name);
@@ -7997,12 +8250,25 @@
     char delim;
     char token[MAX_TOKEN];
     int  i;
+    int  nid;
     char msgString[MAX_BUFFER] = { 0 };
+    char node_name[MAX_TOKEN] = { 0 };
 
     if ( trace_settings & TRACE_SHELL_CMD )
         trace_printf ("%s@%d [%s] processing up node command.\n",
                       method_name, __LINE__, MyName);
 
+    if (AgentType == AgentType_CM)
+    {
+        sprintf( msgString
+               , "[%s] Command 'node up' is not supported in Cloudera Manager "
+                 "installations! (You must use the node role start or restart action)"
+               , MyName );
+        write_startup_log( msgString );
+        printf ("%s\n", msgString);
+        return;
+    }
+
     if (*cmd && delimiter == '{')
     {
         process_cmd = false;
@@ -8068,7 +8334,7 @@
                 else
                 {
                     // 1:1 mapping of virtual logical to physical nodes
-                    node_up( i, Node[i] );
+                    node_up( i );
                 }
             }
             else
@@ -8096,7 +8362,7 @@
             }
             else
             {
-                if ( get_node_name( token ) == 0 ) 
+                if ( get_node_name( token, NULL ) == 0 ) 
                 {
                     if ( ClusterConfig.GetStorageType() == TCDBSQLITE)
                     {
@@ -8115,7 +8381,9 @@
                     return;
                 }
             }
-            node_up( -1, cmd_tail, nowait );
+            STRCPY(node_name, token);
+            nid = get_first_nid( node_name );
+            node_up( nid, nowait );
         }
     }
 }
@@ -8345,6 +8613,10 @@
                 {
                     printf ("[%s] Persist process exec of a SMS process type is not allowed!\n", MyName);
                 }
+                else if (persistConfig->GetProcessType() == ProcessType_DTM)
+                {
+                    printf ("[%s] Persist process exec of a DTM process type is not allowed!\n", MyName);
+                }
                 else if (persistConfig->GetRequiresDTM())
                 {
                     if (DTMexists)
@@ -9410,7 +9682,7 @@
 {
     // Determine trace file name
     const char *tmpDir;
-    tmpDir = getenv( "MPI_TMPDIR" );
+    tmpDir = getenv( "TRAF_LOG" );
 
     if (tmpDir)
     {
@@ -9452,6 +9724,7 @@
     bool exec_one_command = false;
     bool tty = true;
     char delimiter;
+    char *env;
     char *input_file;
     char *cmd_buffer;
     char token[MAX_TOKEN];
@@ -9520,6 +9793,20 @@
         MyNid = VirtualNid;
     }
 
+    env = getenv("TRAF_AGENT");
+    if ( env != NULL && strcmp(env, "CM") == 0 )
+    {
+        AgentType = AgentType_CM;
+    }
+    else if ( env != NULL && strcmp(env, "Ambari") == 0 )
+    {
+        AgentType = AgentType_Ambari;
+    }
+    else
+    {
+        AgentType = AgentType_MPI;
+    }
+
     msg = new struct message_def;
 
     // Load default node information
@@ -9557,7 +9844,7 @@
     // Initialize mpirun std file settings
     MpirunInit();
 
-    char *env = getenv("SQ_ELASTICY_ENABLED");
+    env = getenv("SQ_ELASTICY_ENABLED");
     if ( env && isdigit(*env) )
     {
         if ( strcmp(env,"0") == 0 )
@@ -9573,6 +9860,28 @@
         NameServerEnabled = (val != 0) ? true : false;
     }
 
+    env = getenv("SQ_QUIET_SHELL");
+    if ( env && isdigit(*env) )
+    {
+        if ( strcmp(env,"1") == 0 )
+        {
+          QuietShell = true;
+        }
+    }
+
+    env = getenv("SQ_NODE_ADD_USE_FQDN");
+    if ( env && isdigit(*env) )
+    {
+        if ( strcmp( env, "0" ) == 0 )
+        {
+            NodeAddUseFqdn = false;
+        }
+        else
+        {
+            NodeAddUseFqdn = true;
+        }
+    }
+
     if ( !VirtualNodes )
     {
         env = getenv("SQ_COLD_STANDBY_SPARE");
diff --git a/core/sqf/monitor/linux/testpoint.h b/core/sqf/monitor/linux/testpoint.h
index 058a36f..bf34854 100644
--- a/core/sqf/monitor/linux/testpoint.h
+++ b/core/sqf/monitor/linux/testpoint.h
@@ -63,7 +63,7 @@
            snprintf(buf, sizeof(buf), "[%s], Test point: %s, aborting\n",\
                     method_name, TPVAR);\
            mon_log_write(MON_CLUSTER_REINTEGRATE_10, SQ_LOG_ERR, buf);    \
-           MPI_Abort(MPI_COMM_SELF,99);\
+           mon_failure_exit(true);\
        }\
     }
 
diff --git a/core/sqf/monitor/linux/tmsync.cxx b/core/sqf/monitor/linux/tmsync.cxx
deleted file mode 100644
index b56c5f8..0000000
--- a/core/sqf/monitor/linux/tmsync.cxx
+++ /dev/null
@@ -1,1368 +0,0 @@
-///////////////////////////////////////////////////////////////////////////////
-//
-// @@@ START COPYRIGHT @@@
-//
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-//
-// @@@ END COPYRIGHT @@@
-//
-///////////////////////////////////////////////////////////////////////////////
-
-#include <iostream>
-
-using namespace std;
-
-#include <signal.h>
-#include <fcntl.h>
-#include <sys/file.h>
-#include <errno.h>
-
-#include "monlogging.h"
-#include "montrace.h"
-#include "monitor.h"
-#include "lock.h"
-#include "clusterconf.h"
-#include "lnode.h"
-#include "pnode.h"
-#include "tmsync.h"
-#include "mlio.h"
-#include "reqqueue.h"
-#include "nameserver.h"
-
-extern bool NameServerEnabled;
-extern int trace_level;
-extern int MyPNID;
-extern sigset_t SigSet;
-extern CMonitor *Monitor;
-extern CNodeContainer *Nodes;
-extern CNode *MyNode;
-extern CReqQueue ReqQueue;
-
-CTmSyncReq::CTmSyncReq( int nid, int handle, char *data, int length, int tag, bool unsolicited )
-            :Nid( nid ),
-             Tag( tag ),
-             Handle( handle ),
-             Length( length ),
-             Unsolicited( unsolicited ),
-             Replicated( false ),
-             Completed( false ),
-             Next( NULL ),
-             Prev( NULL )
-{
-    const char method_name[] = "CTmSyncReq::CTmSyncReq";
-    TRACE_ENTRY;
-
-    // Add eyecatcher sequence as a debugging aid
-    memcpy(&eyecatcher_, "TSYN", 4);
-
-    Data = new char [Length+1];
-    memmove( Data, data, Length );
-    if (trace_settings & TRACE_TMSYNC)
-       trace_printf("%s@%d" " - Create " "%s"  "request handle=" "%d"  "\n", method_name, __LINE__, (Unsolicited?"unsolicited ":""), Handle);
-    TRACE_EXIT;
-}
-
-CTmSyncReq::~CTmSyncReq( void )
-{
-    const char method_name[] = "CTmSyncReq::~CTmSyncReq";
-    TRACE_ENTRY;
-    delete [] Data;
-    if (trace_settings & TRACE_TMSYNC)
-       trace_printf("%s@%d" " - Delete " "%s"  "request (%p) handle=" "%d"  "\n", method_name, __LINE__, (Unsolicited?"unsolicited ":""), this, Handle);
-
-    // Alter eyecatcher sequence as a debugging aid to identify deleted object
-    memcpy(&eyecatcher_, "tsyn", 4);
-
-    TRACE_EXIT;
-}
-
-void CTmSyncReq::DeLink (CTmSyncReq ** head, CTmSyncReq ** tail)
-{
-    const char method_name[] = "CTmSyncReq::DeLink";
-    TRACE_ENTRY;
-    if (*head == this)
-        *head = Next;
-    if (*tail == this)
-        *tail = Prev;
-    if (Prev)
-        Prev->Next = Next;
-    if (Next)
-        Next->Prev = Prev;
-    TRACE_EXIT;
-}
-
-CTmSyncReq *CTmSyncReq::GetNext (void)
-{
-    const char method_name[] = "CTmSyncReq::GetNext";
-    TRACE_ENTRY;
-    TRACE_EXIT;
-    return Next;
-}
-
-CTmSyncReq *CTmSyncReq::Link (CTmSyncReq * entry)
-{
-    const char method_name[] = "CTmSyncReq::Link";
-    TRACE_ENTRY;
-    Next = entry;
-    entry->Prev = this;
-
-    TRACE_EXIT;
-    return entry;
-}
-
-CTmSync_Container::CTmSync_Container(void)
-                  :CCluster(),
-                   TmSyncReplies( 0 ),
-                   ReqsInBlock( 0 ),
-                   TmSyncReplyCode( MPI_SUCCESS ),
-                   PendingSlaveTmSyncCount( 0 ),
-                   Head( NULL ),
-                   Tail( NULL ),
-                   HandleSeq(MyPNID*MAX_TM_HANDLES),
-                   PendingSlaveTmSync( false ),
-                   TotalSlaveTmSyncCount( false ),
-                   AbortPendingTmSync( false )
-{
-    const char method_name[] = "CTmSync_Container::CTmSync_Container";
-    TRACE_ENTRY;
-
-    int rc = sem_init(&UnsolicitedWaitSem, 0, 0);
-    if (rc)
-    {
-        int err = errno;
-        char la_buf[MON_STRING_BUF_SIZE];
-        sprintf(la_buf, "[%s], Can't create unnamed semaphore! - errno=%d (%s)\n", method_name, err, strerror(errno));
-        mon_log_write(MON_TMSYNC_INIT_1, SQ_LOG_ERR, la_buf);
-
-        abort();
-    }
-
-    TRACE_EXIT;
-}
-
-CTmSync_Container::~CTmSync_Container(void)
-{
-    CTmSyncReq *req = Head;
-
-    const char method_name[] = "CTmSync_Container::~CTmSync_Container";
-    TRACE_ENTRY;
-
-    while (req)
-    {
-        req->DeLink (&Head, &Tail);
-        delete req;
-        req = Head;
-    }
-
-    int rc = sem_destroy( &UnsolicitedWaitSem );
-    if (rc)
-    {
-        int err = errno;
-        char la_buf[MON_STRING_BUF_SIZE];
-        sprintf(la_buf, "[%s], Can't destroy unnamed semaphore! - errno=%d (%s)\n", method_name, err, strerror(errno));
-        mon_log_write(MON_TMSYNC_DEST_1, SQ_LOG_ERR, la_buf);
-
-        abort();
-    }
-
-    TRACE_EXIT;
-}
-
-void CTmSync_Container::UpdateTmSyncState( int return_code )
-{
-    char                la_buf[MON_STRING_BUF_SIZE];
- 
-    const char method_name[] = "CTmSync_Container::UpdateTmSyncState";
-    TRACE_ENTRY;
-    
-    if ( MyNode->GetTmSyncState() != SyncState_Abort )
-    {
-        if (( MyNode->GetTmSyncState() == SyncState_Start    ) ||
-            ( MyNode->GetTmSyncState() == SyncState_Continue ) ||
-            ( MyNode->GetTmSyncState() == SyncState_Commit   )   )
-        {
-            if ( return_code )
-            {
-                if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-                   trace_printf("%s@%d" " - TmSync aborted" "\n", method_name, __LINE__);
-                MyNode->SetTmSyncState( SyncState_Abort );
-            }
-            else
-            {
-                if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-                   trace_printf("%s@%d" " - TmSync commited" "\n", method_name, __LINE__);
-                MyNode->SetTmSyncState( SyncState_Commit );
-            }
-        }
-        else
-        {
-            sprintf(la_buf, "[%s], Invalid SyncState (%d)! \n", method_name, MyNode->GetTmSyncState());
-            mon_log_write(MON_TMSYNC_UPDATE_STATE_1, SQ_LOG_ERR, la_buf); 
-            MyNode->SetTmSyncState( SyncState_Abort );
-        }
-        if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-           trace_printf("%s@%d" " - Physical Node " "%d"  " TmSyncState updated (" "%d" ")" "\n", method_name, __LINE__, MyPNID, MyNode->GetTmSyncState());
-    }    
-
-    TRACE_EXIT;
-}
-
-void CTmSync_Container::CommitTmDataBlock( int return_code )
-{
-    SyncState           state;
-    char                la_buf[MON_STRING_BUF_SIZE];
- 
-    const char method_name[] = "CTmSync_Container::CommitTmDataBlock";
-    TRACE_ENTRY;
-    
-    UpdateTmSyncState( return_code );
-
-    // Loop here until the TM Sync has completed
-    while (1)
-    {
-        if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-           trace_printf("%s@%d" " - Getting all nodes TmSyncState\n", method_name, __LINE__);
-
-        ExchangeTmSyncState( false );
-        state = Nodes->GetTmState( SyncState_Commit );
-        if (( state == SyncState_Abort ) ||
-            ( state == SyncState_Null  )   )
-        {
-            if ( state == SyncState_Null )
-            {
-                sprintf(la_buf, "[%s], Early termination! \n", method_name);
-                mon_log_write(MON_TMSYNC_COMMITTMDATA_1, SQ_LOG_ERR, la_buf);
-            }
-            if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-               trace_printf("%s@%d" " - TmSyncAbort Sent" "\n", method_name, __LINE__);
-            EndTmSync( MsgType_TmSyncAbort );
-            if ( AbortPendingTmSync )
-            {
-                Monitor->TmSyncAbortPending();
-            }
-            break;
-        }
-        else if ( state == SyncState_Commit )
-        {
-            if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-               trace_printf("%s@%d" " - TmSyncCommit Sent" "\n", method_name, __LINE__);
-            EndTmSync( MsgType_TmSyncCommit );
-            break;
-        }
-        //usleep (5000);
-    }    
-    // End the TM sync processing cycle for my node.
-    MyNode->SetTmSyncState( SyncState_Null );
-    MyNode->SetTmSyncNid( -1 );
-    ExchangeTmSyncState( false );
-    if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-       trace_printf("%s@%d" " - Physical Node " "%d"  " TmSyncState updated (" "%d" ")" "\n", method_name, __LINE__, MyPNID, MyNode->GetTmSyncState());
-    
-    TRACE_EXIT;
-}
-
-int CTmSync_Container::CoordinateTmDataBlock ( struct sync_def *sync )
-{
-    const char method_name[] = "CTmSync_Container::CoordinateTmDataBlock";
-    TRACE_ENTRY;
-    if ( MyNode->GetState() == State_Down )
-    {
-        // For Virtual nodes: if we are down ... 
-        // just return and continue processing like normal
-        return MPI_SUCCESS;
-    }
-
-    if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-    {
-        CNode *node = Nodes->GetNode( sync->pnid );
-        if ( node )
-        {
-            trace_printf("%s@%d" " - Node %s (pnid=%d) TmSync initiated (sync nid=%d, sync state=%d)\n", method_name, __LINE__, node->GetName(), sync->pnid, node->GetTmSyncNid(), node->GetTmSyncState());
-        }
-    }
-    if ( sync->pnid == MyPNID )
-    {
-//        if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-//        {
-//           trace_printf("%s@%d" " - Node %s (pnid=%d) TmSync initiated (sync nid=%d, sync state=%d)\n", method_name, __LINE__, MyNode->GetName(), MyPNID, MyNode->GetTmSyncNid(), MyNode->GetTmSyncState());
-//        }
-        if ( MyNode->GetTmSyncNid() == -1)
-        {
-            // Our physical node requested the TM sync
-            if ( Nodes->GetTmState( SyncState_Null ) != SyncState_Null)
-            {
-                if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-                   trace_printf("%s@%d" " - Tm Sync already pending" "\n", method_name, __LINE__);
-                return MPI_ERR_PENDING;
-            }
-            else
-            {
-                MyNode->SetTmSyncState( SyncState_Start );
-                MyNode->SetTmSyncNid( sync->syncnid );
-                if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-                {
-                   trace_printf("%s@%d" " - Master TmSync started" "\n", method_name, __LINE__);
-                   trace_printf("%s@%d" " - Physical Node %d TmSyncState updated (nid=%d, state=%d)\n", method_name, __LINE__, MyPNID, MyNode->GetTmSyncNid(), MyNode->GetTmSyncState());
-                }
-                syncCycle_.lock();
-                exchangeTmSyncData( sync, false );
-                syncCycle_.unlock();
-                ExchangeTmSyncState( false );
-                if (( Monitor->tmSyncPNid_ == MyPNID                           ) &&
-                    ( Nodes->GetTmState( SyncState_Start ) == SyncState_Start )   )
-                {
-                    // send unsolicited messages to other TMs in
-                    // local physical node and wait for them to reply
-                    if ( MyNode->GetLNodesCount() > 1 )
-                    {
-                        if ( PendingSlaveTmSync )
-                        {
-                            SendUnsolicitedMessages();
-                            while (1)
-                            {
-                                if ( GetTmSyncReplies() == GetTotalSlaveTmSyncCount() )
-                                {
-                                    break;
-                                }
-                                if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-                                    trace_printf("%s@%d" " - Master waiting for Local Unsolicited TmSync reply, total=%d, replies=%d, pending=%d\n", method_name, __LINE__, GetTotalSlaveTmSyncCount(), GetTmSyncReplies(), GetPendingSlaveTmSyncCount() );
-                                    UnsolicitedCompleteWait();
-                            }    
-                        }
-                    }
-                         
-                    // send reply to our TM for the sync request
-                    if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-                       trace_printf("%s@%d" " - Master TmSync reply" "\n", method_name, __LINE__);
-                    CommitTmDataBlock(MPI_SUCCESS);
-                    return MPI_SUCCESS;
-                }
-                else
-                {
-                    if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-                       trace_printf("%s@%d" " - Tm Sync failed to start, tmSyncPNid_=%d, MyPNID=%d, " "TmSyncState=%d, expecting=%d\n", method_name, __LINE__, tmSyncPNid_, MyPNID, Nodes->GetTmState( SyncState_Start ), SyncState_Start);
-                    if (MyNode->GetTmSyncState() == SyncState_Start)
-                    {
-                        MyNode->SetTmSyncState( SyncState_Null );
-                        if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-                           trace_printf("%s@%d" " - Physical Node " "%d"  " TmSyncState updated (" "%d" ")" "\n", method_name, __LINE__, MyPNID, MyNode->GetTmSyncState());
-                    }       
-                    return MPI_ERR_PENDING;
-                }
-            }
-        }
-        else
-        {
-            // Another logical node in my physical node requested a TM sync
-            if ( MyNode->GetTmSyncState() == SyncState_Start )
-            {
-                if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-                {
-                   trace_printf("%s@%d" " - Slave TmSync started on local Physical Node "  "%d" "\n", method_name, __LINE__, sync->pnid);
-                   trace_printf("%s@%d" " - Physical Node " "%d"  " TmSyncState updated (" "%d" ")" "\n", method_name, __LINE__, MyPNID, MyNode->GetTmSyncState());
-                }
-                UnPackSyncData(sync);
-            }
-        }
-    }
-    else
-    {
-        // some other physical node requested a TM sync.
-        if ( MyNode->GetTmSyncState() == SyncState_Null )
-        {
-            // Send sync data to our node's TM
-            MyNode->SetTmSyncState( SyncState_Continue );
-            if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-            {
-               trace_printf("%s@%d" " - Slave TmSync started on Physical Node "  "%d" "\n", method_name, __LINE__, sync->pnid);
-               trace_printf("%s@%d" " - Physical Node " "%d"  " TmSyncState updated (" "%d" ")" "\n", method_name, __LINE__, MyPNID, MyNode->GetTmSyncState());
-            }
-            UnPackSyncData(sync);
-            ExchangeTmSyncState( true );
-        }
-        else
-        {
-            MyNode->SetTmSyncState( SyncState_Abort );
-            if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-            {
-               trace_printf("%s@%d" " - Collision with another TM trying to sync" "\n", method_name, __LINE__);
-               trace_printf("%s@%d" " - Physical Node " "%d"  " TmSyncState updated (" "%d" ")" "\n", method_name, __LINE__, MyPNID, MyNode->GetTmSyncState());
-            }
-        }    
-    }
-
-    TRACE_EXIT;
-    return 0;
-}
-
-void CTmSync_Container::EndTmSync( MSGTYPE type )
-{
-    CProcess           *mytm;
-    CTmSyncReq         *req = Head;
-    CTmSyncReq         *next;
-    CLNode             *lnode;
-    struct message_def *msg = NULL;
-    struct message_def *notice;
-    int                 count = 0;
-    int                 orig_count = 0;
-    char                la_buf[MON_STRING_BUF_SIZE];
-
-    const char method_name[] = "CTmSync_Container::EndTmSync";
-    TRACE_ENTRY;
-
-    // send a commit or abort notice to my node's TM
-    msg = new struct message_def;
-    msg->type = type;
-    msg->noreply = true;
-    msg->u.request.type = ReqType_Notice;
-    while (req)
-    {
-        next = req->GetNext();
-        if ( !req->Unsolicited )
-        {
-            // my node
-            if (trace_settings & TRACE_TMSYNC)
-            {
-                trace_printf("%s@%d - Original request (%p) nid=%d, handle=%d, tag=%d, unsol=%d, comp=%d\n", method_name, __LINE__, req, req->Nid, req->Handle, req->Tag, req->Unsolicited, req->Completed);
-            }
-            msg->u.request.u.tm_sync_notice.orig_tag[orig_count] = req->Tag;
-            msg->u.request.u.tm_sync_notice.orig_handle[orig_count] = req->Handle;
-            orig_count++;
-        }
-        if (( req->Replicated                 ) &&
-            ( req->Completed                  )   )
-        {
-            if (trace_settings & TRACE_TMSYNC)
-            {
-                trace_printf("%s@%d - Request (%p) nid=%d, handle=%d, tag=%d, unsol=%d, comp=%d\n", method_name, __LINE__, req, req->Nid, req->Handle, req->Tag, req->Unsolicited, req->Completed);
-            }
-            if ( tmSyncPNid_ == MyPNID )
-            {
-                if ( MyNode->GetLNodesCount() > 1 )
-                {
-                    if ( req->Unsolicited )
-                    {
-                        msg->u.request.u.tm_sync_notice.nid[count] = req->Nid;
-                        msg->u.request.u.tm_sync_notice.handle[count] = req->Handle;
-                        count++;
-                        if (trace_settings & TRACE_TMSYNC)
-                        {
-                            trace_printf("%s@%d - Count Unsolicited request (%p) nid=%d, handle=%d, tag=%d, count=%d\n", method_name, __LINE__, req, req->Nid, req->Handle, req->Tag, count);
-                        }
-                    }
-                }
-                else
-                {
-                    msg->u.request.u.tm_sync_notice.nid[count] = req->Nid;
-                    msg->u.request.u.tm_sync_notice.handle[count] = req->Handle;
-                    count++;
-                    if (trace_settings & TRACE_TMSYNC)
-                    {
-                        trace_printf("%s@%d - Count request (%p) nid=%d, handle=%d, tag=%d, count=%d\n", method_name, __LINE__, req, req->Nid, req->Handle, req->Tag, count);
-                    }
-                }
-            }
-            else
-            {
-                msg->u.request.u.tm_sync_notice.nid[count] = req->Nid;
-                msg->u.request.u.tm_sync_notice.handle[count] = req->Handle;
-                count++;
-                if (trace_settings & TRACE_TMSYNC)
-                {
-                    trace_printf("%s@%d - Count request (%p) nid=%d, handle=%d, tag=%d, count=%d\n", method_name, __LINE__, req, req->Nid, req->Handle, req->Tag, count);
-                }
-            }
-            req->DeLink(&Head,&Tail);
-            delete req;
-        }
-        req = next;
-    }
-    msg->u.request.u.tm_sync_notice.count = count;
-    msg->u.request.u.tm_sync_notice.orig_count = orig_count;
-
-    lnode = MyNode->GetFirstLNode();
-    for ( ; lnode  ; lnode = lnode->GetNextP() )
-    {
-        if ( lnode->GetState() == State_Up )
-        {
-            // the logical node is up and 
-            // is not the requesting logical node
-            mytm = lnode->GetProcessLByType( ProcessType_DTM );
-            if ( !mytm )
-            {
-                sprintf(la_buf, "[%s], Can't find TM in node=%d\n", method_name, lnode->GetNid());
-                mon_log_write(MON_TMSYNC_END, SQ_LOG_ERR, la_buf); 
-    
-                ReqQueue.enqueueDownReq(MyPNID);
-            }
-            else
-            {
-                if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-                {
-                    trace_printf("%s@%d - Sending Notice to TM=%s (%d,%d)\n", method_name, __LINE__, mytm->GetName(), mytm->GetNid(), mytm->GetPid());
-                    trace_printf("        %s %d handles\n", (type == MsgType_TmSyncCommit?" - Commit ":" - Aborted "), count);
-                }
-                notice = new struct message_def;
-                memmove( notice, msg, sizeof(message_def) );
-
-                SQ_theLocalIOToClient->putOnNoticeQueue( mytm->GetPid()
-                                                       , mytm->GetVerifier()
-                                                       , notice
-                                                       , NULL);
-            }
-        }
-    }
-
-    
-    if ( msg )
-    {
-        delete msg;
-    }
-
-    // Initialize Unsolicited message counters
-    TmSyncReplies = 0;
-    TmSyncReplyCode = MPI_SUCCESS;
-    ReqsInBlock = 0;
-    PendingSlaveTmSyncCount = 0;
-    TotalSlaveTmSyncCount = 0;
-    TRACE_EXIT;
-}
-
-void CTmSync_Container::EndPendingTmSync( struct sync_def *sync )
-{
-    CProcess           *mytm;
-    CTmSyncReq         *req = Head;
-    CTmSyncReq         *next;
-    CLNode             *lnode;
-    struct message_def *msg = NULL;
-    struct message_def *notice;
-    int                 count = 0;
-    int                 orig_count = 0;
-    char                la_buf[MON_STRING_BUF_SIZE];
-
-    const char method_name[] = "CTmSync_Container::EndPendingTmSync";
-    TRACE_ENTRY;
-
-    // send a abort notice to my node's TM
-    msg = new struct message_def;
-    msg->type = MsgType_TmSyncAbort;
-    msg->noreply = true;
-    msg->u.request.type = ReqType_Notice;
-    while (req)
-    {
-        next = req->GetNext();
-        if ( !req->Unsolicited )
-        {
-            // my node
-            if (trace_settings & TRACE_TMSYNC)
-            {
-                trace_printf("%s@%d - Original request (%p) nid=%d, handle=%d, tag=%d, unsol=%d, comp=%d\n", method_name, __LINE__, req, req->Nid, req->Handle, req->Tag, req->Unsolicited, req->Completed);
-            }
-            msg->u.request.u.tm_sync_notice.orig_tag[orig_count] = req->Tag;
-            msg->u.request.u.tm_sync_notice.orig_handle[orig_count] = req->Handle;
-            orig_count++;
-        }
-        if (( sync->syncnid == req->Nid ) &&
-            ( req->Replicated           ) &&
-            ( !req->Unsolicited         ) &&
-            ( req->Completed            )   )
-        {
-            if (trace_settings & TRACE_TMSYNC)
-            {
-                trace_printf("%s@%d - Request (%p) nid=%d, handle=%d, tag=%d, unsol=%d, comp=%d\n", method_name, __LINE__, req, req->Nid, req->Handle, req->Tag, req->Unsolicited, req->Completed);
-            }
-            msg->u.request.u.tm_sync_notice.nid[count] = req->Nid;
-            msg->u.request.u.tm_sync_notice.handle[count] = req->Handle;
-            count++;
-            if (trace_settings & TRACE_TMSYNC)
-            {
-                trace_printf("%s@%d - Count request (%p) nid=%d, handle=%d, tag=%d, count=%d\n", method_name, __LINE__, req, req->Nid, req->Handle, req->Tag, count);
-            }
-            req->DeLink(&Head,&Tail);
-            delete req;
-        }
-        req = next;
-    }
-    
-    if ( count )
-    {
-        msg->u.request.u.tm_sync_notice.count = count;
-        msg->u.request.u.tm_sync_notice.orig_count = orig_count;
-
-        lnode = MyNode->GetFirstLNode();
-        for ( ; lnode  ; lnode = lnode->GetNextP() )
-        {
-            if ( lnode->GetNid() == sync->syncnid &&
-                 lnode->GetState() == State_Up )
-            {
-                // the logical node is up and 
-                // is not the requesting logical node
-                mytm = lnode->GetProcessLByType( ProcessType_DTM );
-                if ( !mytm )
-                {
-                    sprintf(la_buf, "[%s], Can't find TM in node=%d\n", method_name, lnode->GetNid());
-                    mon_log_write(MON_TMSYNC_END_PENDING, SQ_LOG_ERR, la_buf); 
-    
-                    ReqQueue.enqueueDownReq(MyPNID);
-                }
-                else
-                {
-                    if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-                    {
-                        trace_printf("%s@%d - Sending Notice to TM=%s (%d,%d)\n", method_name, __LINE__, mytm->GetName(), mytm->GetNid(), mytm->GetPid());
-                        trace_printf("      - Aborted %d handles\n", count);
-                    }
-                    notice = new struct message_def;
-                    memmove( notice, msg, sizeof(message_def) );
-                    SQ_theLocalIOToClient->putOnNoticeQueue( mytm->GetPid()
-                                                           , mytm->GetVerifier()
-                                                           , notice
-                                                           , NULL);
-                }
-            }
-        }
-    }
-
-    
-    if ( msg )
-    {
-        delete msg;
-    }
-
-    TRACE_EXIT;
-}
-
-void CTmSync_Container::ProcessTmSyncReply( struct message_def * msg )
-{
-    const char method_name[] = "CTmSync_Container::ProcessTmSyncReply";
-    TRACE_ENTRY;
-
-    CTmSyncReq *tmsync_req;
-
-    if (trace_settings & (TRACE_REQUEST | TRACE_TMSYNC))
-        trace_printf("%s@%d - Unsolicited TmSync Reply\n",
-                     method_name, __LINE__);
-    tmsync_req = FindTmSyncReq( msg->u.reply.u.unsolicited_tm_sync.handle );
-    if (tmsync_req)
-    {
-        if (trace_settings & (TRACE_REQUEST | TRACE_TMSYNC))
-            trace_printf("%s@%d - Unsolicited TmSync reply, handle=%d\n",
-                         method_name, __LINE__, tmsync_req->Handle);
-        if (msg->u.reply.u.unsolicited_tm_sync.return_code == MPI_SUCCESS)
-        {
-            TmSyncReplyCode |= msg->u.reply.u.unsolicited_tm_sync.return_code;
-            tmsync_req->Completed = true;
-            UnsolicitedComplete( msg );
-            if ( tmSyncPNid_ == MyPNID )
-            {
-                if (trace_settings & (TRACE_REQUEST | TRACE_TMSYNC))
-                    trace_printf("%s@%d - Local Unsolicited TmSync reply, handle="
-                                 "%d\n", method_name, __LINE__,
-                                 tmsync_req->Handle);
-                if ( GetTmSyncReplies() == GetTotalSlaveTmSyncCount() )
-                {
-                    UpdateTmSyncState( TmSyncReplyCode );
-                    UnsolicitedCompleteDone();
-                }
-            }
-            else
-            {
-                if ( GetTmSyncReplies() == GetTotalSlaveTmSyncCount() )
-                {
-                    CommitTmDataBlock(TmSyncReplyCode);
-                }
-            }
-        }
-        else
-        { // The Seabed callback has not been registered, try again
-            if (trace_settings & (TRACE_REQUEST | TRACE_TMSYNC))
-                trace_printf("%s@%d - Retrying Local Unsolicited TmSync, handle="
-                             "%d\n", method_name, __LINE__,
-                             tmsync_req->Handle);
-            PendingSlaveTmSyncCount--;
-            tmsync_req->Completed = false;
-            SendUnsolicitedMessages();
-        }
-    }
-    else
-    {
-        if (trace_settings & (TRACE_REQUEST | TRACE_TMSYNC))
-            trace_printf("%s@%d" " - Can't find TmSync request, handle="  "%d" "\n", method_name, __LINE__, msg->u.reply.u.unsolicited_tm_sync.handle);
-    }
-    msg->noreply = true;
-
-    if (trace_settings & TRACE_TMSYNC)
-       trace_printf("%s@%d" " - Unsolicited TmSync notices, total=%d, replies=%d, pending=%d\n", method_name, __LINE__, GetTotalSlaveTmSyncCount(), GetTmSyncReplies(), GetPendingSlaveTmSyncCount() );
-
-    TRACE_EXIT;
-}
-
-void CTmSync_Container::ExchangeTmSyncState( bool bumpSync )
-{
-    struct sync_def sync;
-
-    const char method_name[] = "CTmSync_Container::ExchangeTmSyncState";
-    TRACE_ENTRY;
-
-    sync.type = SyncType_TmSyncState;
-    sync.pnid = MyPNID;
-    sync.syncnid = MyNode->GetTmSyncNid();
-    sync.state = MyNode->GetTmSyncState();
-    sync.count = 0;
-    sync.length = 0;
-    syncCycle_.lock();
-    exchangeTmSyncData( &sync, bumpSync );
-    syncCycle_.unlock();
-
-    TRACE_EXIT;
-}
-
-CTmSyncReq *CTmSync_Container::FindTmSyncReq( int handle )
-{
-    CTmSyncReq *req = Head;
-
-    const char method_name[] = "CTmSync_Container::FindTmSyncReq";
-    TRACE_ENTRY;
-    while (req)
-    {
-        if( req->Handle == handle && req->Unsolicited)
-        {
-            if (trace_settings & TRACE_TMSYNC)
-            {
-                trace_printf("%s@%d - request (%p) nid=%d, handle=%d, tag=%d, unsol=%d, comp=%d\n", method_name, __LINE__, req, req->Nid, req->Handle, req->Tag, req->Unsolicited, req->Completed);
-            }
-            break;
-        }
-        req = req->GetNext();
-    }
-    TRACE_EXIT;
-
-    return req;
-}
-
-int CTmSync_Container::GetHandle( void )
-{
-    const char method_name[] = "CTmSync_Container::GetHandle";
-    TRACE_ENTRY;
-    if ( HandleSeq >= (MyPNID+1)*MAX_TM_HANDLES ) 
-    {
-        HandleSeq = MyPNID*MAX_TM_HANDLES;
-    }
-    TRACE_EXIT;
-    return ++HandleSeq;
-}
-
-struct sync_def *CTmSync_Container::PackSyncData( void )
-{
-    char *ptr;
-    CTmSyncReq *req = Head;
-    struct sync_def *sync;
-
-    const char method_name[] = "CTmSync_Container::PackSyncData";
-    TRACE_ENTRY;
-
-    sync = new struct sync_def;
-    sync->type = SyncType_TmData;
-    sync->pnid = MyPNID;
-    sync->syncnid = -1;
-    sync->state = SyncState_Start;
-    sync->count = 0;
-    sync->length = 0;
-    ptr = sync->data;
-    while ( req )
-    {
-        if (( !req->Unsolicited                                         ) &&
-            ( !req->Replicated                                          ) &&
-            (  sync->length+req->Length+(sizeof(int)*3) < MAX_SYNC_DATA ) && 
-            (  sync->count                              < MAX_TM_SYNCS  )   )
-        {
-            if ( sync->syncnid == -1 )
-            {
-                // The first pending request's logical node wins
-                sync->syncnid = req->Nid;
-            }
-            if ( sync->syncnid == req->Nid )
-            {
-                if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-                   trace_printf("%s@%d" " - Packing TmSync request, nid=%d, handle="  "%d" "\n", method_name, __LINE__, req->Nid, req->Handle);
-                // load request
-                sync->count++;
-                *((int*)ptr) = req->Nid;
-                ptr += sizeof(int);
-                *((int*)ptr) = req->Handle;
-                ptr += sizeof(int);
-                *((int*)ptr) = req->Length;
-                ptr += sizeof(int);
-                memmove( ptr, req->Data, req->Length );
-                ptr += req->Length;
-                req->Replicated = true;
-                sync->length += ((sizeof(int)*3)+req->Length);
-            }
-            else
-            {
-                if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-                   trace_printf("%s@%d" " - NOT Packing TmSync request, nid=%d, handle="  "%d" "\n", method_name, __LINE__, req->Nid, req->Handle);
-            }
-        }
-        req = req->GetNext();
-    }
-    TRACE_EXIT;
-
-    return sync;
-}
-
-void CTmSync_Container::ReQueue_TmSync( bool master )
-{
-    int collisionNid = -1;
-    MyNode->SetTmSyncNid( -1 );
-    CTmSyncReq *req = Head;
-    CTmSyncReq *next;
-
-    const char method_name[] = "CTmSync_Container::ReQueue_TmSync";
-    TRACE_ENTRY; 
-    while (req)
-    {
-        if (trace_settings & TRACE_TMSYNC)
-        {
-            trace_printf("%s@%d - request (%p) nid=%d, handle=%d, tag=%d, unsol=%d, comp=%d\n", method_name, __LINE__, req, req->Nid, req->Handle, req->Tag, req->Unsolicited, req->Completed);
-        }
-        next = req->GetNext();
-        if ( req->Replicated )
-        {
-            if ( master && !req->Unsolicited )
-            {
-                if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-                   trace_printf("%s@%d" " - Collision, resetting handle="  "%d" "\n", method_name, __LINE__, req->Handle);
-                req->Replicated = false;
-                collisionNid = req->Nid;
-            }
-            if ( master && req->Unsolicited && req->Nid == collisionNid ) 
-            { 
-                if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-                   trace_printf("%s@%d" " - Collision, deleting unsolicited handle="  "%d" "\n", method_name, __LINE__, req->Handle);
-                req->DeLink(&Head,&Tail);
-                delete req;
-            }
-            if ( !master && req->Unsolicited ) 
-            { 
-                if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-                   trace_printf("%s@%d" " - Collision, deleting unsolicited handle="  "%d" "\n", method_name, __LINE__, req->Handle);
-                req->DeLink(&Head,&Tail);
-                delete req;
-            }
-        }
-        req = next;
-    }
-    TRACE_EXIT; 
-}
-
-CTmSyncReq *CTmSync_Container::Q_TmSync(int nid, int handle, char *data, int len, int tag, bool unsolicited)
-{
-    CTmSyncReq *req = NULL;
-    char        la_buf[MON_STRING_BUF_SIZE];
-    
-    const char method_name[] = "CTmSync_Container::Q_TmSync";
-    TRACE_ENTRY;
-
-    if ( len > MAX_SYNC_DATA )
-    {
-        sprintf(la_buf, "[%s], Tm Sync length greater than max, len=%d. \n", method_name, len);
-        mon_log_write(MON_TMSYNC_Q_TMSYNC, SQ_LOG_ERR, la_buf);
-    }
-    else
-    {
-        req = new CTmSyncReq(nid, handle, data, len, tag, unsolicited);
-        if (Head == NULL)
-        {
-            Head = Tail = req;
-        }
-        else
-        {
-            Tail = Tail->Link (req);
-        }
-    }
-
-    TRACE_EXIT;
-    return req;
-}
-
-void CTmSync_Container::SendUnsolicitedMessages (void)
-{
-    int                 numTMs = 0;
-    CTmSyncReq         *req = Head;
-    CProcess           *mytm;
-    CProcess           *tm = NULL;
-    CLNode             *lnode;
-    struct message_def *msg = NULL;
-    struct message_def *notice;
-    char                la_buf[MON_STRING_BUF_SIZE];
-
-    const char method_name[] = "CTmSync_Container::SendUnsolicitedMessages";
-    TRACE_ENTRY;
-
-    while (req)
-    {
-        if (trace_settings & TRACE_TMSYNC)
-        {
-            trace_printf("%s@%d - request (%p) nid=%d, handle=%d, tag=%d, unsol=%d, comp=%d\n", method_name, __LINE__, req, req->Nid, req->Handle, req->Tag, req->Unsolicited, req->Completed);
-        }
-        if ( req->Unsolicited && !req->Completed )
-        { 
-            if (!tm || req->Nid != tm->GetNid())
-            { 
-                // Get the TM that initiated the sync request
-                tm = LNode[req->Nid]->GetProcessLByType( ProcessType_DTM );
-            }
-            if (!tm && NameServerEnabled)
-            {
-                if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
-                {
-                    trace_printf( "%s@%d - Getting process from Name Server, nid=%d, type=ProcessType_DTM\n"
-                                , method_name, __LINE__, req->Nid );
-                }
-            
-                tm = Nodes->GetProcessLByTypeNs( req->Nid, ProcessType_DTM );
-            }
-            if ( tm )
-            {
-                // send all TmSync requests data to the local TM processes
-                msg = new struct message_def;
-                msg->type = MsgType_UnsolicitedMessage;
-                msg->noreply = false;
-                msg->u.request.type = ReqType_TmSync;
-                msg->u.request.u.unsolicited_tm_sync.nid = tm->GetNid();
-                msg->u.request.u.unsolicited_tm_sync.pid = tm->GetPid();
-                msg->u.request.u.unsolicited_tm_sync.handle = req->Handle;
-                memmove( msg->u.request.u.unsolicited_tm_sync.data, req->Data, req->Length );
-                msg->u.request.u.unsolicited_tm_sync.length = req->Length;
-
-                // count the number of candidate TMs
-                if ( numTMs == 0 )
-                {
-                    lnode = MyNode->GetFirstLNode();
-                    for ( ; lnode  ; lnode = lnode->GetNextP() )
-                    {
-                        if ( lnode->GetNid() != req->Nid )
-                        {
-                            numTMs++;
-                        }
-                    }
-                    // calculate the total number of TmSync request 
-                    TotalSlaveTmSyncCount = ReqsInBlock * numTMs;
-                }
-
-                if (trace_settings & TRACE_TMSYNC)
-                   trace_printf("%s@%d" " - Unsolicited TmSync notices, total=%d, replies=%d, pending=%d\n", method_name, __LINE__, GetTotalSlaveTmSyncCount(), GetTmSyncReplies(), GetPendingSlaveTmSyncCount() );
-                   
-                lnode = MyNode->GetFirstLNode();
-                for ( ; lnode  ; lnode = lnode->GetNextP() )
-                {
-                    if ( lnode->GetState() == State_Up && 
-                         lnode->GetNid()   != req->Nid      )
-                    {
-                        // the logical node is up and 
-                        // is not the requesting logical node
-                        mytm = lnode->GetProcessLByType( ProcessType_DTM );
-                        if ( !mytm )
-                        {
-                            sprintf(la_buf, "[%s], Can't find TM in node=%d\n", method_name, lnode->GetNid());
-                            mon_log_write(MON_TMSYNC_SEND_UNSOLICITED, SQ_LOG_ERR, la_buf); 
-                
-                            ReqQueue.enqueueDownReq(MyPNID);
-                        }
-                        else
-                        {
-                            if (trace_settings & TRACE_TMSYNC)
-                            {
-                                trace_printf("%s@%d - Sending Unsolicited TmSync to TM=%s (%d,%d)\n", method_name, __LINE__, mytm->GetName(), mytm->GetNid(), mytm->GetPid());
-                                trace_printf("        from TM=%s (%d,%d)\n", tm->GetName(), tm->GetNid(), tm->GetPid());
-                                trace_printf("        tag=%d\n", req->Tag);
-                                trace_printf("        handle=%d\n", req->Handle);
-                            }
-                            notice = new struct message_def;
-                            memmove( notice, msg, sizeof(message_def) );
-                            SQ_theLocalIOToClient->putOnNoticeQueue( mytm->GetPid()
-                                                                   , mytm->GetVerifier()
-                                                                   , notice
-                                                                   , NULL);
-                            mytm->IncrUnsolTmSyncCount();
-                            PendingSlaveTmSyncCount++;
-                        }
-                    }
-                }
-                if ( msg )
-                {
-                    delete msg;
-                    msg = NULL;
-                }
-                if (NameServerEnabled)
-                {
-                    if (!MyNode->IsMyNode( tm->GetNid() )
-                      && (req->GetNext() && req->GetNext()->Nid != tm->GetNid() ) )
-                    {
-                        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
-                        {
-                            trace_printf( "%s@%d - Deleting clone process %s, (%d,%d:%d)\n"
-                                        , method_name, __LINE__
-                                        , tm->GetName()
-                                        , tm->GetNid()
-                                        , tm->GetPid()
-                                        , tm->GetVerifier() );
-                        }
-                        Nodes->DeleteCloneProcess( tm );
-                        tm = NULL;
-                    }
-                
-                }
-            }
-            else
-            {
-                sprintf(la_buf, "[%s], Can't find requesting TM for nid= %d.\n", method_name, req->Nid);
-                mon_log_write(MON_TMSYNC_UNPACKSYNCDATA, SQ_LOG_ERR, la_buf); 
-                CNode *node = LNode[req->Nid]->GetNode();
-                ReqQueue.enqueueDownReq( node->GetPNid() );
-            }
-        }
-        req = req->GetNext();
-    }
-
-    if (trace_settings & TRACE_TMSYNC)
-       trace_printf("%s@%d" " - Unsolicited TmSync notices, total=%d, replies=%d, pending=%d\n", method_name, __LINE__, GetTotalSlaveTmSyncCount(), GetTmSyncReplies(), GetPendingSlaveTmSyncCount() );
-       
-    PendingSlaveTmSync = false;
-
-    TRACE_EXIT;
-}
-
-void CTmSync_Container::TmSync( void )
-{
-    int              rc;
-    struct sync_def *block;
-
-    const char method_name[] = "CTmSync_Container::TmSync";
-    TRACE_ENTRY;
-
-    block = PackSyncData();
-    if ( block->count )
-    {
-        if (trace_settings & TRACE_TMSYNC)
-           trace_printf("%s@%d" " - Processing TmSync request" "\n", method_name, __LINE__);
-        rc = CoordinateTmDataBlock( block );
-        if ( rc != MPI_SUCCESS )
-        {
-            ReQueue_TmSync (true);
-            if (trace_settings & TRACE_TMSYNC)
-               trace_printf("%s@%d" " - Collision, no requests processed" "\n", method_name, __LINE__);
-        }
-    }
-
-    if (trace_settings & TRACE_TMSYNC)
-       trace_printf("%s@%d" " - PendingTmSync=%d, total=%d, replies=%d, pending=%d\n", method_name, __LINE__, PendingSlaveTmSync, GetTotalSlaveTmSyncCount(), GetTmSyncReplies(), GetPendingSlaveTmSyncCount() );
-
-    delete block;
-
-    TRACE_EXIT;
-}
-
-void CTmSync_Container::TmSyncAbortPending( void )
-{
-    bool             notDone = true;
-    struct sync_def *block = NULL;
-
-    const char method_name[] = "CTmSync_Container::TmSyncAbortPending";
-    TRACE_ENTRY;
-
-    do
-    {
-        block = PackSyncData();
-        if ( block->count )
-        {
-            if (trace_settings & TRACE_TMSYNC)
-               trace_printf("%s@%d" " - Aborting pending TmSync\n", method_name, __LINE__);
-            EndPendingTmSync( block );
-        }
-        else
-        {
-            notDone = false;
-        }
-
-        if (trace_settings & TRACE_TMSYNC)
-           trace_printf("%s@%d" " - PendingTmSync=%d, total=%d, replies=%d, pending=%d\n", method_name, __LINE__, PendingSlaveTmSync, GetTotalSlaveTmSyncCount(), GetTmSyncReplies(), GetPendingSlaveTmSyncCount() );
-
-        delete block;
-    }
-    while( notDone );
-
-    AbortPendingTmSync = false;
-
-    TRACE_EXIT;
-}
-
-bool CTmSync_Container::TmSyncPending( void )
-{
-    bool rc = false;
-    bool ready = true;
-    int pnid = 0;
-    CNode *node;
-    CTmSyncReq *req;
-
-    const char method_name[] = "CTmSync_Container::TmSyncPending";
-    TRACE_ENTRY;
-    if ( MyNode->GetState() == State_Down )
-    {
-        // For Virtual nodes: if we are down ... 
-        // just return and continue processing like normal
-        return false;
-    }
-    if (trace_settings & TRACE_TMSYNC)
-       trace_printf("%s@%d" " - PendingTmSync=%d, total=%d, replies=%d, pending=%d\n", method_name, __LINE__, PendingSlaveTmSync, GetTotalSlaveTmSyncCount(), GetTmSyncReplies(), GetPendingSlaveTmSyncCount() );
-
-    if (( MyNode->GetTmSyncState() == SyncState_Abort ) &&
-        ( tmSyncPNid_ != MyPNID ) &&
-        ( GetTmSyncReplies() == GetTotalSlaveTmSyncCount() )   )
-    {
-        CommitTmDataBlock( MPI_ERR_UNKNOWN );
-    }
-    else
-    {
-        if ( ! MyNode->IsSpareNode() && MyNode->GetPhase() != Phase_Ready )
-        {
-            MyNode->CheckActivationPhase();
-        }
-        if ( Nodes->GetTmState( SyncState_Suspended ) == SyncState_Suspended )
-        {
-            if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-               trace_printf("%s@%d" " - TmSync suspended" "\n", method_name, __LINE__);
-            return false;
-        }
-        if (trace_settings & TRACE_TMSYNC)
-           trace_printf("%s@%d" " - PendingTmSync=%d, total=%d, replies=%d, pending=%d\n", method_name, __LINE__, PendingSlaveTmSync, GetTotalSlaveTmSyncCount(), GetTmSyncReplies(), GetPendingSlaveTmSyncCount() );
-        if ( PendingSlaveTmSync )
-        {
-            if (trace_settings & TRACE_TMSYNC)
-               trace_printf("%s@%d" " - Pending TmSync, total=%d, replies=%d, pending=%d\n", method_name, __LINE__, GetTotalSlaveTmSyncCount(), GetTmSyncReplies(), GetPendingSlaveTmSyncCount() );
-            SendUnsolicitedMessages();
-            return false;
-        }
-        if ( TmSyncReplies != GetTotalSlaveTmSyncCount() )
-        {
-            if (trace_settings & TRACE_TMSYNC)
-               trace_printf("%s@%d" " - TmSync waiting for TmSync replies, total=%d, replies=%d, pending=%d\n", method_name, __LINE__, GetTotalSlaveTmSyncCount(), GetTmSyncReplies(), GetPendingSlaveTmSyncCount() );
-            return false;
-        }
-    }
-
-    // if no one is trying to tmsync and we have something ... then go for it
-    node = Nodes->GetNode(pnid);
-    while ( node )
-    {
-        if (( node->GetState() == State_Up && ! node->IsSpareNode() ) &&
-            ( node->GetTmSyncState() != SyncState_Null )   )
-        {
-            if (trace_settings & TRACE_TMSYNC)
-               trace_printf("%s@%d" " - TmSync needed, but not ready. TmSyncState("  "%d" ")" "\n", method_name, __LINE__, node->GetTmSyncState());
-            ready = false;
-            break;
-        }
-        node = Nodes->GetNode(++pnid);
-    }
-
-    if ( ready )
-    {
-        req = Head;
-        while ( req )
-        {
-            if ( !req->Replicated )
-            {
-                if (trace_settings & TRACE_TMSYNC)
-                   trace_printf("%s@%d" " - TmSync needed" "\n", method_name, __LINE__);
-                rc = true;
-                break;
-            }
-            req = req->GetNext();
-        }
-    }
-
-    TRACE_EXIT;
-    return rc;
-}
-
-void CTmSync_Container::UnPackSyncData(struct sync_def *sync)
-{
-    char               *ptr;
-    int                 nid;
-    int                 handle;
-    int                 length;
-    CTmSyncReq         *req;
-
-    const char method_name[] = "CTmSync_Container::UnPackSyncData";
-    TRACE_ENTRY;
-
-    if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
-       trace_printf("%s@%d" " - UnPacking TmSync request(s)" "\n", method_name, __LINE__);
-
-    // Initialize Unsolicited message counters
-    TmSyncReplies = 0;
-    TotalSlaveTmSyncCount = 0;
-    TmSyncReplyCode = MPI_SUCCESS;
-    ReqsInBlock = sync->count;
-
-    // send all TmSync request data to TM process
-    ptr = sync->data;
-    while (sync->count)
-    {
-        // load request
-        nid = *((int*)ptr);
-        ptr += sizeof(int);
-        handle = *((int*)ptr);
-        ptr += sizeof(int);
-        length = *((int*)ptr);
-        ptr += sizeof(int);
-
-        // save another nodes TmSync request in our nodes' request queue,
-        // but mark it as having been replicated
-        req = Q_TmSync(nid, handle, ptr, length, -1, true);
-        req->Replicated = true;
-        PendingSlaveTmSync = true;
-
-        ptr += length;
-        sync->count--;
-    }
-    TRACE_EXIT;
-}
-
-void CTmSync_Container::CancelUnsolicitedMessages( CProcess *tmProcess )
-{
-    CTmSyncReq *req = Head;
-    const char method_name[] = "CTmSync_Container::CancelUnsolicitedMessages";
-    TRACE_ENTRY;
-
-    while (req)
-    {
-        if (trace_settings & TRACE_TMSYNC)
-        {
-            trace_printf("%s@%d - request (%p) nid=%d, handle=%d, tag=%d, unsol=%d, comp=%d\n", method_name, __LINE__, req, req->Nid, req->Handle, req->Tag, req->Unsolicited, req->Completed);
-        }
-        if ( req->Unsolicited && !req->Completed )
-        {
-            // Count it as a completed reply
-            IncrTmSyncReplies();
-            tmProcess->DecrUnsolTmSyncCount();
-            if ( trace_settings & TRACE_TMSYNC )
-            {
-                trace_printf("%s@%d - Canceling Unsolicited TmSync to TM=%s (%d,%d)\n", method_name, __LINE__, tmProcess->GetName(), tmProcess->GetNid(), tmProcess->GetPid());
-                trace_printf("        tag=%d\n", req->Tag);
-                trace_printf("        handle=%d\n", req->Handle);
-                trace_printf("%s@%d - TM=%s (%d,%d), TmSync count=%d  \n", method_name, __LINE__, tmProcess->GetName(), tmProcess->GetNid(), tmProcess->GetPid(), tmProcess->GetUnsolTmSyncCount());
-            }
-
-            int rc = sem_post( &UnsolicitedWaitSem );
-            if ( rc && errno != EINTR)
-            {
-                int err = errno;
-                char la_buf[MON_STRING_BUF_SIZE];
-                sprintf(la_buf, "[%s], Can't post on unnamed semaphore! - errno=%d (%s)\n", method_name, err, strerror(errno));
-                mon_log_write(MON_TMSYNC_CANCEL_UNSOL_MESSAGE_1, SQ_LOG_ERR, la_buf);
-            }
-        
-            break;
-        }
-        req = req->GetNext();
-    }
-
-    TRACE_EXIT;
-}
-
-void CTmSync_Container::UnsolicitedComplete( struct message_def *msg )
-{
-    const char method_name[] = "CTmSync_Container::UnsolicitedComplete";
-    TRACE_ENTRY;
-
-    CLNode *lnode = MyNode->GetLNode( msg->u.reply.u.unsolicited_tm_sync.nid );
-    assert( lnode );
-
-    CProcess *mytm = lnode->GetProcessLByType( ProcessType_DTM );
-    if ( !mytm )
-    {
-        char la_buf[MON_STRING_BUF_SIZE];
-        sprintf(la_buf, "[%s], Can't find TM in node nid=%d\n", method_name, lnode->GetNid());
-        mon_log_write(MON_TMSYNC_UNSOL_COMPLETE_1, SQ_LOG_ERR, la_buf); 
-
-        ReqQueue.enqueueDownReq(MyPNID);
-    }
-    else
-    {
-        IncrTmSyncReplies();
-        mytm->DecrUnsolTmSyncCount();
-        if ( trace_settings & TRACE_TMSYNC )
-        {
-            trace_printf("%s@%d - TmSync reply from TM=%s (%d,%d), TmSync count=%d  \n", method_name, __LINE__, mytm->GetName(), mytm->GetNid(), mytm->GetPid(), mytm->GetUnsolTmSyncCount());
-        }
-    }
-
-    TRACE_EXIT;
-}
-
-void CTmSync_Container::UnsolicitedCompleteDone( void )
-{
-    const char method_name[] = "CTmSync_Container::UnsolicitedCompleteDone";
-    TRACE_ENTRY;
-
-    int rc = sem_post( &UnsolicitedWaitSem );
-    if ( rc && errno != EINTR)
-    {
-        int err = errno;
-        char la_buf[MON_STRING_BUF_SIZE];
-        sprintf(la_buf, "[%s], Can't post on unnamed semaphore! - errno=%d (%s)\n", method_name, err, strerror(errno));
-        mon_log_write(MON_TMSYNC_UNSOL_COMPLETE_2, SQ_LOG_ERR, la_buf);
-    }
-
-    TRACE_EXIT;
-}
-
-void CTmSync_Container::UnsolicitedCompleteWait( void )
-{
-    const char method_name[] = "CTmSync_Container::UnsolicitedCompleteWait";
-    TRACE_ENTRY;
-
-    int rc;
-    bool waitSomeMore = true;
-
-    do
-    {
-        rc = sem_wait( &UnsolicitedWaitSem );
-        if ( rc )
-        {
-            if ( errno != EINTR )
-            {
-                int err = errno;
-                char la_buf[MON_STRING_BUF_SIZE];
-                sprintf(la_buf, "[%s], Can't wait on unnamed semaphore! - errno=%d (%s)\n", method_name, err, strerror(errno));
-                mon_log_write(MON_TMSYNC_UNSOL_COMPLETE_WAIT_1, SQ_LOG_ERR, la_buf);
-                waitSomeMore = false;
-            }
-        }
-        else
-        {
-            waitSomeMore = false;
-        }
-    }
-    while( waitSomeMore );
-
-    TRACE_EXIT;
-}
diff --git a/core/sqf/monitor/linux/tmsync.h b/core/sqf/monitor/linux/tmsync.h
deleted file mode 100644
index 9d0aaf4..0000000
--- a/core/sqf/monitor/linux/tmsync.h
+++ /dev/null
@@ -1,147 +0,0 @@
-///////////////////////////////////////////////////////////////////////////////
-//
-// @@@ START COPYRIGHT @@@
-//
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-//
-// @@@ END COPYRIGHT @@@
-//
-///////////////////////////////////////////////////////////////////////////////
-
-#ifndef TMSYNC_H_
-#define TMSYNC_H_
-
-#include <semaphore.h>
-
-#include "lock.h"
-#include "msgdef.h"
-#include "cluster.h"
-#include "process.h"
-
-class CTmSyncReq
-{
- private:
-    int            eyecatcher_;      // Debuggging aid -- leave as first
-                                     // member variable of the class
-public:
-    int    Nid;
-    int    Tag;
-    int    Handle;
-    int    Length;
-    char  *Data;
-    bool   Unsolicited; // request is from another node
-    bool   Replicated;  // this request has been replicated across nodes
-    bool   Completed;   // unsolicited request has accepted by TM
-
-    CTmSyncReq( int nid, int handle, char *data, int length, int tag, bool unsolicited );
-    ~CTmSyncReq( void );
-    void DeLink( CTmSyncReq **head, CTmSyncReq **tail );
-    CTmSyncReq *GetNext( void );
-    CTmSyncReq *Link( CTmSyncReq *entry );
-
-
-protected:
-private:
-    CTmSyncReq *Next;
-    CTmSyncReq *Prev;
-};
-
-class CTmSync_Container : public CCluster
-{
-public:
-    int TmSyncReplies;     // # of unsolicited replies received for current block
-    int ReqsInBlock;       // # of requests currently being processed
-    int TmSyncReplyCode;   // last unsuccessful unsolicited reply code for block
-    int PendingSlaveTmSyncCount; // number of unsolicited messages sent to TMs for slave sync
-
-    CTmSync_Container( void );
-    virtual ~CTmSync_Container( void );
-
-    void CancelUnsolicitedMessages( CProcess *tmProcess );
-    int  GetHandle( void );         // return the next handle id to use
-    inline bool GetPendingSlaveTmSync( void ) { return( PendingSlaveTmSync ); }
-    int  GetTotalSlaveTmSyncCount( void )
-    {
-        CAutoLock lock(TmSyncReplyLock.getLocker());
-        return( TotalSlaveTmSyncCount );
-    }
-    inline bool IsAbortPendingTmSync( void ) { return( AbortPendingTmSync ); }
-    void ProcessTmSyncReply( struct message_def * msg );
-    int  CoordinateTmDataBlock( struct sync_def *block );
-    CTmSyncReq *FindTmSyncReq( int handle );
-    CTmSyncReq *Q_TmSync( int nid, int handle, char *data, int len, int tag, bool unsolicited );
-    void ReQueue_TmSync( bool master ); // return packed TmSyncReqQ entries to the queue
-    inline void SetAbortPendingTmSync( void ) { AbortPendingTmSync = true; }
-    void TmSync( void );            // Coordinate TmSync across nodes using Im'Alive()
-    void TmSyncAbortPending( void );   // Abort all local unprocessed TmSyncs
-    bool TmSyncPending( void );
-
-protected:
-    void CommitTmDataBlock( int return_code );
-    void IncrTmSyncReplies( void )
-    {
-        TmSyncReplyLock.lock();
-        TmSyncReplies++;
-        TmSyncReplyLock.unlock();
-    }
-    int DecrTmSyncReplies( void )
-    {
-        TmSyncReplyLock.lock();
-        TmSyncReplies--;
-        TmSyncReplyLock.unlock();
-        return( TmSyncReplies );
-    }
-    int  GetPendingSlaveTmSyncCount( void )
-    {
-        CAutoLock lock(TmSyncReplyLock.getLocker());
-        return( PendingSlaveTmSyncCount );
-    }
-    int  GetReqsInBlock( void )
-    {
-        return( ReqsInBlock );
-    }
-    int  GetTmSyncReplies( void )
-    {
-        CAutoLock lock(TmSyncReplyLock.getLocker());
-        return( TmSyncReplies );
-    }
-    void UpdateTmSyncState( int return_code );
-    void UnsolicitedComplete( struct message_def *msg );
-    void UnsolicitedCompleteDone( void );
-    void UnsolicitedCompleteWait( void );
-
-private:
-    sem_t       UnsolicitedWaitSem; // Unsolicited message completion semaphore
-    CLock       TmSyncReplyLock;    // number of unsolicited messages sent to TMs for slave sync
-    CTmSyncReq *Head;               // Pointer to 1st TmSync request needing resolution
-    CTmSyncReq *Tail;               // Pointer to last TmSync request needing resolution
-    int         HandleSeq;
-    bool        PendingSlaveTmSync; // true when we need to send unsolicited messages to TM for slave sync
-    int         TotalSlaveTmSyncCount; // number of unsolicited messages to be sent to TMs for slave sync
-                                       // depending on the number of non-master TM in physical node
-    bool        AbortPendingTmSync; // set true when node down or activating spare node is triggered
-
-    void EndTmSync( MSGTYPE type );
-    void EndPendingTmSync( struct sync_def *sync );
-    void ExchangeTmSyncState( bool bumpSync );
-    struct sync_def *PackSyncData( void );        // return sync_def for current TmSyncReqQ entries
-    void SendUnsolicitedMessages( void );
-    void UnPackSyncData( struct sync_def *sync ); // process sync_def received from another node
-};
-
-#endif /*TMSYNC_H_*/
diff --git a/core/sqf/monitor/linux/watchdog.cxx b/core/sqf/monitor/linux/watchdog.cxx
index d0eabb8..fedb591 100644
--- a/core/sqf/monitor/linux/watchdog.cxx
+++ b/core/sqf/monitor/linux/watchdog.cxx
@@ -503,14 +503,6 @@
             Watchdog->CLock::wakeOne();
             break;
 
-        case MsgType_UnsolicitedMessage:
-            if (trace_settings & TRACE_REQUEST)
-            {
-                trace_printf( "%s@%d CB Unsolicited Message Received!\n",
-                              method_name, __LINE__ );
-            }
-            break;
-
         default:
             if (trace_settings & TRACE_REQUEST)
             {
@@ -539,7 +531,7 @@
                 char buf[MON_STRING_BUF_SIZE];
                 sprintf(buf, "[%s - InitLocalIO], Error= Failed to load cluster configuration!\n", MyName);
                 monproc_log_write(MON_WATCHDOG_INITLOCALIO_1, SQ_LOG_ERR, buf);
-                abort();
+                exit(EXIT_FAILURE);
             }
         }
         else
@@ -553,7 +545,7 @@
                 monproc_log_write(MON_WATCHDOG_INITLOCALIO_3, SQ_LOG_ERR, buf);
                 MyNid = 0;
             }
-            abort();
+            exit(EXIT_FAILURE);
         }
 
         lnodeConfig = ClusterConfig.GetLNodeConfig( MyNid );
@@ -605,7 +597,7 @@
 {
     // Determine trace file name
     const char *tmpDir;
-    tmpDir = getenv( "MPI_TMPDIR" );
+    tmpDir = getenv( "TRAF_LOG" );
         
     const char *envVar;
     envVar = getenv("WDT_TRACE_FILE");
@@ -766,13 +758,6 @@
     MyPid = atoi (argv[4]);
     gv_ms_su_verif  = MyVerifier = atoi(argv[9]);
 
-    // Set flag to indicate whether we are operating in a real cluster
-    // or a virtual cluster.
-    if ( getenv("SQ_VIRTUAL_NODES") )
-    {
-        IsRealCluster = false;
-    }
-
     MonLog = new CMonLog( "log4cxx.monitor.wdg.config", "WDG", "alt.wdg", MyPNID, MyNid, MyPid, MyName  );
 
     Watchdog = new CWatchdog();
diff --git a/core/sqf/monitor/linux/wdtimer.cxx b/core/sqf/monitor/linux/wdtimer.cxx
index 04b6bab..88f3efb 100644
--- a/core/sqf/monitor/linux/wdtimer.cxx
+++ b/core/sqf/monitor/linux/wdtimer.cxx
@@ -65,13 +65,12 @@
 #define WDIOC_SQ_GETTIMEOUT        _IOR(WATCHDOG_IOCTL_BASE, 7, int)
 
 
-// The following defines specify the default values for the timers if the timer related variables are not defined.
-// The first is a timeout value to use during startup initialization.  The SETTIMEOUT facility
-// only sets the timeout, but does not drive the lower level driver to use that value until
+// WDT_STARTUPTIMERDEFAULT is the timeout value to use during startup
+// initialization.  The SETTIMEOUT facility only sets the timeout, but
+// does not drive the lower level driver to use that value until
 // a KEEPALIVE call.
-// Defaults give 30 second and 10 second timer expiration respectively.
-#define WDT_StartupTimerDefault 30
-#define WDT_KeepAliveTimerDefault 10
+// Default value is in seconds
+#define WDT_STARTUPTIMERDEFAULT 30
 
 
 CWdTimer::CWdTimer()
@@ -81,7 +80,7 @@
          ,watchdog_(false)
          ,wdtRefresh_(Wdt_Disabled)
          ,wdtFd_(-1)
-         ,wdtKeepAliveTimerValue_(WDT_KeepAliveTimerDefault)
+         ,wdtKeepAliveTimerValue_(WDT_KEEPALIVETIMERDEFAULT)
 {
     const char method_name[] = "CWdTimer::CWdTimer";
     TRACE_ENTRY;
@@ -163,7 +162,7 @@
                 int err = errno;
                 sprintf(la_buf, "[CWdTimer::ResetWatchdogTimer], Keep alive failed. (Error: %s)\n", strerror(err));
                 monproc_log_write(MON_WDTIMER_RESET_WATCHTIMER, SQ_LOG_ERR, la_buf);
-                abort();
+                exit(EXIT_FAILURE);
             }
             gettimeofday(&wdTimerStart_, NULL);
         }
@@ -194,7 +193,7 @@
             err = errno;
             sprintf(la_buf, "[CWdTimer::RestoreWatchdogTimer], Set timeout failed. (Error: %s)\n", strerror(err));
             monproc_log_write(MON_WDTIMER_RESTORE_WATCHTIMER_1, SQ_LOG_ERR, la_buf);
-            abort();
+            exit(EXIT_FAILURE);
         }
 
         if (ioctl(wdtFd_, WDIOC_SQ_KEEPALIVE, &arg) == -1)
@@ -202,7 +201,7 @@
             err = errno;
             sprintf(la_buf, "[CWdTimer::RestoreWatchdogTimer], Keep alive failed. (Error: %s)\n", strerror(err));
             monproc_log_write(MON_WDTIMER_RESTORE_WATCHTIMER_2, SQ_LOG_ERR, la_buf);
-            abort();
+            exit(EXIT_FAILURE);
         }
         if (trace_settings & TRACE_INIT)
            trace_printf("%s@%d" " - Watchdog Timer set to %d" "\n", method_name, __LINE__, timer);
@@ -245,7 +244,7 @@
                 err = errno;
                 sprintf(la_buf, "[CWdTimer::SetWatchdogTimerMin], Set timeout failed. (Error: %s)\n", strerror(err));
                 monproc_log_write(MON_WDTIMER_SETMIN_WATCHTIMER_1, SQ_LOG_ERR, la_buf);
-                abort();
+                exit(EXIT_FAILURE);
             }
 
             if (ioctl(wdtFd_, WDIOC_SQ_KEEPALIVE, &arg) == -1)
@@ -253,7 +252,7 @@
                 err = errno;
                 sprintf(la_buf, "[CWdTimer::SetWatchdogTimerMin], Keep alive failed. (Error: %s)\n", strerror(err));
                 monproc_log_write(MON_WDTIMER_SETMIN_WATCHTIMER_2, SQ_LOG_ERR, la_buf);
-                abort();
+                exit(EXIT_FAILURE);
             }
 
             if (trace_settings & TRACE_INIT)
@@ -315,7 +314,7 @@
 
         if (!(WDT_StartupTimerValueC = getenv("SQ_WDT_STARTUPTIMERVALUE")))
         {
-            WDT_StartupTimerValue = WDT_StartupTimerDefault;
+            WDT_StartupTimerValue = WDT_STARTUPTIMERDEFAULT;
         }
         else
         {
@@ -324,7 +323,7 @@
 
         if (!(WDT_KeepAliveTimerValueC = getenv("SQ_WDT_KEEPALIVETIMERVALUE")))
         {
-            wdtKeepAliveTimerValue_ = WDT_KeepAliveTimerDefault;
+            wdtKeepAliveTimerValue_ = WDT_KEEPALIVETIMERDEFAULT;
         }
         else
         {
@@ -376,7 +375,7 @@
                 err = errno;
                 sprintf(la_buf, "[CWdTimer::StartWatchdogTimer], Set timeout failed. (Error: %s)\n", strerror(err));
                 monproc_log_write(MON_WDTIMER_START_WATCHTIMER_2, SQ_LOG_ERR, la_buf);
-                abort();
+                exit(EXIT_FAILURE);
             }
 
             if (ioctl(fd, WDIOC_SQ_KEEPALIVE, &arg) == -1)
@@ -384,7 +383,7 @@
                 err = errno;
                 sprintf(la_buf, "[CWdTimer::StartWatchdogTimer], Keep alive failed. (Error: %s)\n", strerror(err));
                 monproc_log_write(MON_WDTIMER_START_WATCHTIMER_3, SQ_LOG_ERR, la_buf);
-                abort();
+                exit(EXIT_FAILURE);
             }
 
             timer = wdtKeepAliveTimerValue_;
@@ -393,7 +392,7 @@
                 err = errno;
                 sprintf(la_buf, "[CWdTimer::StartWatchdogTimer], Set Timeout failed. (Error: %s)\n", strerror(err));
                 monproc_log_write(MON_WDTIMER_START_WATCHTIMER_4, SQ_LOG_ERR, la_buf);
-                abort();
+                exit(EXIT_FAILURE);
             }
             else
             {
diff --git a/core/sqf/monitor/linux/zclient.cxx b/core/sqf/monitor/linux/zclient.cxx
index 19a7679..38b629d 100644
--- a/core/sqf/monitor/linux/zclient.cxx
+++ b/core/sqf/monitor/linux/zclient.cxx
@@ -25,6 +25,7 @@
 #include <errno.h>
 #include <sys/ioctl.h>
 #include <sys/time.h>
+#include <sys/resource.h>
 #include <signal.h>
 #include <ctype.h>
 #include <string.h>
@@ -40,9 +41,9 @@
 #include "montrace.h"
 #include "monlogging.h"
 #include "reqqueue.h"
-#include "pnode.h"
 #include "type2str.h"
 #include "zclient.h"
+#include "pnode.h"
 
 //
 // The following specify the default values for the timers if the
@@ -52,10 +53,8 @@
 #define ZCLIENT_MY_ZNODE_CHECKRATE            5 // seconds
 #define ZCLIENT_SESSION_TIMEOUT              60 // seconds (1 minute)
 
-// The monitors register their znodes under the cluster znode
-#define ZCLIENT_CLUSTER_ZNODE               "/cluster"
-
 // zookeeper connection retries
+#define ZOOKEEPER_CHILD_RETRY_COUNT           5
 #define ZOOKEEPER_RETRY_COUNT                 3
 #define ZOOKEEPER_RETRY_WAIT                  1 // seconds
 
@@ -64,13 +63,16 @@
 extern char Node_name[MAX_PROCESSOR_NAME];
 extern int MyPNID;
 extern int MyNid;
-extern int MyPid;
+extern int MyPid;                                               
 
-extern CNodeContainer *Nodes;
 extern CReqQueue ReqQueue;
 extern CZClient    *ZClient;
 extern CMonLog     *MonLog;
+extern CNodeContainer *Nodes;
+extern CNode *MyNode;
 extern bool debugFlag;
+extern bool IsAgentMode;
+extern bool IsMaster;
 
 static zhandle_t *ZHandle;
 static clientid_t MyZooId;
@@ -103,12 +105,20 @@
             return "ZC_DISABLED";
         case CZClient::ZC_START:
             return "ZC_START";
-        case CZClient::ZC_CLUSTER:
-            return "ZC_CLUSTER";
-        case CZClient::ZC_ZNODE:
-            return "ZC_ZNODE";
         case CZClient::ZC_WATCH:
             return "ZC_WATCH";
+        case CZClient::ZC_CLUSTER:
+            return "ZC_CLUSTER";
+        case CZClient::ZC_ZNODE_CREATED:
+            return "ZC_ZNODE_CREATED";
+        case CZClient::ZC_ZNODE_CHANGED:
+            return "ZC_ZNODE_CHANGED";
+        case CZClient::ZC_ZNODE_CHILD:
+            return "ZC_ZNODE_CHILD";
+        case CZClient::ZC_ZNODE_DELETED:
+            return "ZC_ZNODE_DELETED";
+        case CZClient::ZC_MYZNODE:
+            return "ZC_MYZNODE";
         case CZClient::ZC_STOP:
             return "ZC_STOP";
         case CZClient::ZC_SHUTDOWN:
@@ -119,6 +129,36 @@
     return "ZClient State Invalid";
 }
 
+// ZClientThread main
+static void *ZClientThread(void *arg)
+{
+    const char method_name[] = "ZClientThread";
+    TRACE_ENTRY;
+
+    // Parameter passed to the thread is an instance of the CZClient object
+    CZClient *zooClient = (CZClient *) arg;
+
+    // Mask all allowed signals 
+    sigset_t  mask;
+    sigfillset(&mask);
+    sigdelset(&mask, SIGPROF); // allows profiling such as google profiler
+    int rc = pthread_sigmask(SIG_SETMASK, &mask, NULL);
+    if (rc != 0)
+    {
+        char buf[MON_STRING_BUF_SIZE];
+        snprintf(buf, sizeof(buf), "[%s], pthread_sigmask error=%d\n",
+                 method_name, rc);
+        mon_log_write(MON_ZCLIENT_ZCLIENTTHREAD_1, SQ_LOG_ERR, buf);
+    }
+
+    // Enter thread processing loop
+    zooClient->MonitorCluster();
+
+    TRACE_EXIT;
+    return NULL;
+}
+
+
 void ZSessionWatcher( zhandle_t *zzh
                     , int type
                     , int state
@@ -173,46 +213,24 @@
         {
             char buf[MON_STRING_BUF_SIZE];
             snprintf( buf, sizeof(buf)
-                    , "[%s], Error Zookeeper authentication failure. Node going down...\n"
+                    , "[%s], Error Zookeeper authentication failure. Node going down (terminating!) ...\n"
                     ,  method_name );
             mon_log_write(MON_ZCLIENT_ZSESSIONWATCHER_1, SQ_LOG_CRIT, buf);
 
-            HandleMyNodeExpiration();
-
-            zookeeper_close( zzh );
-            ZHandle=0;
+            mon_failure_exit();
         }
         else if ( state == ZOO_EXPIRED_SESSION_STATE )
         {
             char buf[MON_STRING_BUF_SIZE];
             snprintf( buf, sizeof(buf)
-                    , "[%s], Error Zookeeper session expired. Node going down...\n"
+                    , "[%s], Error Zookeeper session expired. Node going down (terminating!) ...\n"
                     ,  method_name );
             mon_log_write(MON_ZCLIENT_ZSESSIONWATCHER_2, SQ_LOG_CRIT, buf);
 
-            HandleMyNodeExpiration();
-
-            zookeeper_close( zzh );
-            ZHandle=0;
+            mon_failure_exit();
         }
     }
-    else if ( type == ZOO_CREATED_EVENT )
-    {
-        ZClient->TriggerCheck( type, path );
-    }
-    else if ( type == ZOO_DELETED_EVENT )
-    {
-        ZClient->TriggerCheck( type, path );
-    }
-    else if ( type == ZOO_CHANGED_EVENT )
-    {
-        ZClient->TriggerCheck( type, path );
-    }
-    else if ( type == ZOO_CHILD_EVENT )
-    {
-        ZClient->TriggerCheck( type, path );
-    }
-    else if ( type == ZOO_NOTWATCHING_EVENT )
+    else
     {
         ZClient->TriggerCheck( type, path );
     }
@@ -226,8 +244,9 @@
          :threadId_(0)
          ,state_(ZC_DISABLED)
          ,enabled_(false)
-         ,checkCluster_(false)
+         ,clusterWatchEnabled_(false)
          ,resetMyZNodeFailedTime_(true)
+         ,shutdown_(false)
          ,zcMonitoringRate_(ZCLIENT_MY_ZNODE_CHECKRATE) // seconds
          ,zkQuorumHosts_(quorumHosts)
          ,zkRootNode_(rootNode)
@@ -273,7 +292,8 @@
                 , "[%s], Zookeeper quorum port address not initialized\n"
                 ,  method_name);
         mon_log_write(MON_ZCLIENT_ZCLIENT_1, SQ_LOG_ERR, buf);
-        abort();
+
+        mon_failure_exit();
     }
     else
     {
@@ -302,7 +322,8 @@
                 , "[%s], zookeeper_init() failed for host:port %s\n"
                 , method_name, zkQuorumPort_.str( ).c_str( ));
         mon_log_write(MON_ZCLIENT_ZCLIENT_2, SQ_LOG_ERR, buf);
-        abort();
+
+        mon_failure_exit();
     }
     
     int rc = InitializeZClient();
@@ -313,9 +334,13 @@
                 , "[%s], Failed ZClient initialization (%s)\n"
                 , method_name, zerror(rc) );
         mon_log_write(MON_ZCLIENT_ZCLIENT_3, SQ_LOG_ERR, buf);
-        abort();
+
+        mon_failure_exit();
     }
 
+    ConfiguredZNodesDelete();
+    ErrorZNodesDelete();
+
     TRACE_EXIT;
 }
 
@@ -328,7 +353,9 @@
 
     if (ZHandle)
     {
-        WatchNodeDelete( Node_name );
+        ConfiguredZNodesDelete();
+        ErrorZNodesDelete();
+        RunningZNodeDelete( Node_name );
         zookeeper_close(ZHandle);
         ZHandle = 0;
     }
@@ -336,267 +363,156 @@
     TRACE_EXIT;
 }
 
-void CZClient::CheckCluster( void )
+void CZClient::ClusterMonitoringStart( void )
 {
-    const char method_name[] = "CZClient::CheckCluster";
+    const char method_name[] = "CZClient::ClusterMonitoringStart";
+    TRACE_ENTRY;
+
+    if ( !IsEnabled() )
+    {
+        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+        {
+            trace_printf( "%s@%d Cluster monitoring started!\n\n", method_name, __LINE__ );
+        }
+        EnabledSet( true );
+        StateSet( ZC_WATCH );
+        CLock::wakeOne();
+    }
+
+    TRACE_EXIT;
+}
+
+void CZClient::ClusterMonitoringStop( void )
+{   // Disables cluster monitoring; does nothing when monitoring is not currently enabled.
+    const char method_name[] = "CZClient::ClusterMonitoringStop";
+    TRACE_ENTRY;
+
+    if ( IsEnabled() )  // act only on an enabled -> disabled transition
+    {
+        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+        {
+            trace_printf( "\n%s@%d Cluster monitoring stopped!\n", method_name, __LINE__ );
+        }
+        ClusterWatchEnabledSet( false );  // the cluster watch is switched off together with monitoring
+        EnabledSet( false );
+        StateSet( ZC_DISABLED );
+        CLock::wakeOne();  // NOTE(review): presumably wakes the ZClient thread so it observes the new state -- confirm
+    }
+
+    TRACE_EXIT;
+}
+
+int CZClient::ConfiguredZNodeCreate( const char *nodeName )
+{
+    const char method_name[] = "CZClient::ConfiguredZNodeCreate";
     TRACE_ENTRY;
 
     int rc;
+
+    lock();
+
+    stringstream newpath;
+    newpath.str( "" );
+    newpath << configuredZNodePath_.c_str() << "/"
+            << nodeName;
+    string configZnode = newpath.str( );
+
+    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+    {
+        trace_printf( "%s@%d ZNodeCreate(%s)\n"
+                    , method_name, __LINE__
+                    , configZnode.c_str() );
+    }
+
+    // Suppress error logging if error == ZNODEEXISTS
+    rc = ZNodeCreate( configZnode.c_str(), NULL, 0, true );
+
+    unlock();
+
+    TRACE_EXIT;
+    return(rc);
+}
+
+int CZClient::ConfiguredZNodeDelete( const char *nodeName )
+{   // Deletes the znode <configuredZNodePath_>/<nodeName>; returns the ZNodeDelete() result.
+    const char method_name[] = "CZClient::ConfiguredZNodeDelete";
+    TRACE_ENTRY;
+
+    int rc;
+
+    lock();  // held across path construction and the delete; released before returning
+
+    stringstream newpath;
+    newpath.str( "" );
+    newpath << configuredZNodePath_.c_str() << "/"
+            << nodeName;
+    string configZnode = newpath.str( );
+
+    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+    {
+        trace_printf( "%s@%d ZNodeDelete(%s)\n"  // the trace names the operation actually performed
+                    , method_name, __LINE__
+                    , configZnode.c_str() );
+    }
+
+    rc = ZNodeDelete( configZnode );
+
+    unlock();
+
+    TRACE_EXIT;
+    return(rc);
+}
+
+void CZClient::ConfiguredZNodesDelete( void )
+{
+    const char method_name[] = "CZClient::ConfiguredZNodesDelete";
+    TRACE_ENTRY;
+
+    int rc = -1;
     struct String_vector nodes;
 
-    if ( IsCheckCluster() )
+    rc = ConfiguredZNodesGet( &nodes );
+    if ( rc != ZOK && rc != ZNONODE )
     {
-        rc = GetClusterZNodes( &nodes );
-        if ( rc != ZOK )
-        {
-            char buf[MON_STRING_BUF_SIZE];
-            snprintf( buf, sizeof(buf)
-                    , "[%s], GetClusterZNodes() failed!\n"
-                    , method_name );
-            mon_log_write(MON_ZCLIENT_CHECKCLUSTER_1, SQ_LOG_ERR, buf);
-            SetState( CZClient::ZC_STOP );
-            CLock::wakeOne();
-            return;
-        }
-
-        stringstream newpath;
-        string monZnode;
-        string nodeName;
-        int    pnid = -1;
-    
-        if ( nodes.count > 0 )
-        {
-            for (int i = 0; i < nodes.count ; i++ )
-            {
-                newpath.str( "" );
-                newpath << zkRootNode_.c_str() 
-                        << zkRootNodeInstance_.c_str()
-                        << ZCLIENT_CLUSTER_ZNODE << "/"
-                        << nodes.data[i];
-                string monZnode = newpath.str( );
-            
-                rc = GetZNodeData( monZnode, nodeName, pnid );
-                if ( rc != ZOK )
-                {
-                    char buf[MON_STRING_BUF_SIZE];
-                    snprintf( buf, sizeof(buf)
-                            , "[%s], GetZNodeData(%s) failed!\n"
-                            , method_name
-                            , monZnode.c_str() );
-                    mon_log_write(MON_ZCLIENT_CHECKCLUSTER_2, SQ_LOG_ERR, buf);
-                }
-                else
-                {
-                    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
-                    {
-                        trace_printf( "%s@%d monZnode=%s, nodeName=%s, pnid=%d)\n"
-                                    , method_name, __LINE__
-                                    , monZnode.c_str(), nodeName.c_str(), pnid );
-                    }
-                }
-            }
-            FreeStringVector( &nodes );
-        }
+        char buf[MON_STRING_BUF_SIZE];
+        snprintf( buf, sizeof(buf)
+                , "[%s], ConfiguredZNodesGet() failed!\n"
+                , method_name );
+        mon_log_write(MON_ZCLIENT_CONFIGZNODESDELETE_1, SQ_LOG_ERR, buf);
+        CLock::wakeOne();
+        return;
     }
-    else
+
+    stringstream newpath;
+    string configznode;
+
+    if ( nodes.count > 0 )
     {
-        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+        for (int i = 0; i < nodes.count ; i++ )
         {
-            trace_printf( "%s@%d CheckCluster is NOT set!\n"
-                        , method_name, __LINE__ );
-        }
-    }
-    
-    TRACE_EXIT;
-}
-
-void CZClient::CheckMyZNode( void )
-{
-    const char method_name[] = "CZClient::CheckMyZNode";
-    TRACE_ENTRY;
-
-    int zerr;
-    struct timespec currentTime;
-
-    if ( IsCheckCluster() )
-    {
-        if (resetMyZNodeFailedTime_)
-        {
-            resetMyZNodeFailedTime_ = false;
-            clock_gettime(CLOCK_REALTIME, &myZNodeFailedTime_);
-            myZNodeFailedTime_.tv_sec += (GetSessionTimeout() * 2);
-#if 0
-            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
-            {
-                trace_printf( "%s@%d" " - Resetting MyZnode Fail Time %ld(secs)\n"
-                            , method_name, __LINE__
-                            , myZNodeFailedTime_.tv_sec );
-            }
-#endif
-        }
-        if ( ! IsZNodeExpired( Node_name, zerr ) )
-        {
-            if ( zerr == ZCONNECTIONLOSS || zerr == ZOPERATIONTIMEOUT )
-            {
-                // Ignore transient errors with the quorum.
-                // However, if longer than the session
-                // timeout, handle it as a hard error.
-                clock_gettime(CLOCK_REALTIME, &currentTime);
-                if (currentTime.tv_sec > myZNodeFailedTime_.tv_sec)
-                {
-                    char buf[MON_STRING_BUF_SIZE];
-                    snprintf( buf, sizeof(buf)
-                            , "[%s], Zookeeper quorum comm error: %s - Handling my znode (%s) as expired! Node is going down.\n"
-                            , method_name, zerror(zerr), Node_name );
-                    mon_log_write(MON_ZCLIENT_CHECKMYZNODE_1, SQ_LOG_ERR, buf);
-                    HandleMyNodeExpiration();
-                }
-            }
-            else
-            {
-                resetMyZNodeFailedTime_ = true;
-            }
-        }
-        else
-        {
-            char buf[MON_STRING_BUF_SIZE];
-            snprintf( buf, sizeof(buf)
-                    , "[%s], My znode (%s) expired! Node is going down.\n"
-                    , method_name, Node_name );
-            mon_log_write(MON_ZCLIENT_CHECKMYZNODE_2, SQ_LOG_ERR, buf);
-            HandleMyNodeExpiration();
-        }
-    }
-    
-    TRACE_EXIT;
-}
-
-int CZClient::ZooExistRetry(zhandle_t *zh, const char *path, int watch, struct Stat *stat)
-{
-    int retries = 0;
-    int rc;
-    rc = zoo_exists(zh, path, watch, stat);
-
-    // retry when loss zconnection, this may caused by one zookeeper server down
-    while ( rc == ZCONNECTIONLOSS && retries < ZOOKEEPER_RETRY_COUNT)
-    {
-        sleep(ZOOKEEPER_RETRY_WAIT);
-        retries++;
-        rc = zoo_exists(zh, path, watch, stat);
-    }
-    return rc;
-}
-
-const char* CZClient::WaitForAndReturnMaster( bool doWait )
-{
-    const char method_name[] = "CZClient::WaitForAndReturnMaster";
-    TRACE_ENTRY;
-    
-    bool found = false;
-    int rc = -1;
-    int retries = 0;
-    Stat stat;
-
-    struct String_vector nodes = {0, NULL};
-    stringstream ss;
-    ss.str( "" );
-    ss << zkRootNode_.c_str() 
-       << zkRootNodeInstance_.c_str() 
-       << ZCLIENT_MASTER_ZNODE;
-    string masterMonitor( ss.str( ) );
-
-    // wait for 3 minutes for giving up.  
-    while ( (GetState() != ZC_SHUTDOWN) && (!found) && (retries < 180)) 
-    {
-        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
-        {
-            trace_printf( "%s@%d trafCluster=%s\n"
-                        , method_name, __LINE__, masterMonitor.c_str() );
-        }
-        // Verify the existence of the parent ZCLIENT_MASTER_ZNODE
-        rc = ZooExistRetry( ZHandle, masterMonitor.c_str( ), 0, &stat );
+            newpath.str( "" );
+            newpath << configuredZNodePath_.c_str() << "/"
+                    << nodes.data[i];
+            configznode = newpath.str( );
         
-        if ( rc == ZNONODE )
-        {
-            if (doWait == false)
-            {
-                break;
-            } 
-            usleep(1000000); // sleep for a second as to not overwhelm the system   
-            retries++;
-            continue;
-        }
-        else if ( rc == ZOK )
-        {
-            // Now get the list of available znodes in the cluster.
-            //
-            // This will return child znodes for each monitor process that has
-            // registered, including this process.
-            rc = zoo_get_children( ZHandle, masterMonitor.c_str( ), 0, &nodes );
-            if ( nodes.count > 0 )
-            {
-                if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
-                {
-                    trace_printf( "%s@%d nodes.count=%d\n"
-                                , method_name, __LINE__
-                                , nodes.count );
-                }
-                found = true;
-            }
-            else
-            {
-                if (doWait == false)
-                {
-                    break;
-                }
-                usleep(1000000); // sleep for a second as to not overwhelm the system   
-                retries++;
-                continue;
-            }
-        }
-         
-        else  // error
-        { 
             if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
             {
-                trace_printf( "%s@%d Error (MasterMonitor) WaitForAndReturnMaster returned rc (%d), retries %d\n"
-                        , method_name, __LINE__, rc, retries );
+                trace_printf( "%s@%d Deleting configznode=%s\n"
+                            , method_name, __LINE__
+                            , configznode.c_str() );
             }
-            char buf[MON_STRING_BUF_SIZE];
-            snprintf( buf, sizeof(buf)
-                    , "[%s], ZooExistRetry() for %s failed with error %s\n"
-                    ,  method_name, masterMonitor.c_str( ), zerror(rc));
-            mon_log_write(MON_ZCLIENT_WAITFORANDRETURNMASTER, SQ_LOG_ERR, buf);
-            break;
+
+            ZNodeDelete( configznode );
         }
-    }
-         
-    //should we assert nodes.count == 1?
-    if (found)
-    {
-        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
-        {
-            trace_printf( "%s@%d (MasterMonitor) Master Monitor found (%s/%s)\n"
-                        , method_name, __LINE__, masterMonitor.c_str(), nodes.data[0] );
-        }
-        TRACE_EXIT;
-        return nodes.data[0];
-    }
-    else
-    {
-      if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
-        {
-            trace_printf( "%s@%d (MasterMonitor) Master Monitor NOT found\n" , method_name, __LINE__);
-        }
+        FreeStringVector( &nodes );
     }
 
     TRACE_EXIT;
-    return NULL;
 }
 
-int CZClient::GetClusterZNodes( String_vector *nodes )
+int CZClient::ConfiguredZNodesGet( String_vector *nodes )
 {
-    const char method_name[] = "CZClient::GetClusterZNodes";
+    const char method_name[] = "CZClient::ConfiguredZNodesGet";
     TRACE_ENTRY;
 
     bool found = false;
@@ -604,12 +520,7 @@
     int retries = 0;
     Stat stat;
 
-    stringstream ss;
-    ss.str( "" );
-    ss << zkRootNode_.c_str() 
-       << zkRootNodeInstance_.c_str() 
-       << ZCLIENT_CLUSTER_ZNODE;
-    string trafCluster( ss.str( ) );
+    string configznodes( configuredZNodePath_.c_str() );
 
     nodes->count = 0;
     nodes->data = NULL;
@@ -618,11 +529,11 @@
     {
         if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
         {
-            trace_printf( "%s@%d trafCluster=%s\n"
-                        , method_name, __LINE__, trafCluster.c_str() );
+            trace_printf( "%s@%d configznodes=%s\n"
+                        , method_name, __LINE__, configznodes.c_str() );
         }
-        // Verify the existence of the parent ZCLIENT_CLUSTER_ZNODE
-        rc = ZooExistRetry( ZHandle, trafCluster.c_str( ), 0, &stat );
+        // Verify the existence of the parent
+        rc = ZooExistRetry( ZHandle, configznodes.c_str( ), 0, &stat );
         if ( rc == ZNONODE )
         {
             if (retries > 10)
@@ -636,8 +547,8 @@
             //
             // This will return child znodes for each monitor process that has
             // registered, including this process.
-            rc = zoo_get_children( ZHandle, trafCluster.c_str( ), 0, nodes );
-            if ( nodes->count > 0 )
+            rc = zoo_get_children( ZHandle, configznodes.c_str( ), 0, nodes );
+            if ( rc == ZOK )
             {
                 if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
                 {
@@ -649,7 +560,605 @@
             }
             else
             {
-                if (retries > 10)
+                char buf[MON_STRING_BUF_SIZE];
+                snprintf( buf, sizeof(buf)
+                        , "[%s], zoo_get_children(%s) failed with error %s\n"
+                        ,  method_name, configznodes.c_str( ), zerror(rc));
+                mon_log_write(MON_ZCLIENT_CONFIGZNODESGET_1, SQ_LOG_ERR, buf);
+                break;
+            }
+        }
+        else  // error
+        {
+            char buf[MON_STRING_BUF_SIZE];
+            snprintf( buf, sizeof(buf)
+                    , "[%s], zoo_exists(%s) failed with error %s\n"
+                    ,  method_name, configznodes.c_str( ), zerror(rc));
+            mon_log_write(MON_ZCLIENT_CONFIGZNODESGET_2, SQ_LOG_ERR, buf);
+            break;
+        }
+    }
+
+    TRACE_EXIT;
+    return( rc );
+}
+
+void CZClient::ConfiguredZNodesWatchSet( void )
+{   // Sets a child watch on configuredZNodePath_ via ZNodeWatchChildSet(); a failure is logged, not returned.
+    const char method_name[] = "CZClient::ConfiguredZNodesWatchSet";
+    TRACE_ENTRY;
+
+    int rc;
+
+    stringstream configpath;
+    string confignode;
+
+    configpath.str( "" );
+    configpath << configuredZNodePath_.c_str();
+    confignode = configpath.str( );
+
+    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+    {
+        trace_printf( "%s@%d Setting watch set on confignode=%s\n"
+                    , method_name, __LINE__
+                    , confignode.c_str() );
+    }
+
+    rc = ZNodeWatchChildSet( confignode );
+    if ( rc != ZOK )
+    {
+        char buf[MON_STRING_BUF_SIZE];
+        snprintf( buf, sizeof(buf)
+                , "[%s], ZNodeWatchChildSet(%s) failed!\n"
+                , method_name              // first %s: the reporting method
+                , confignode.c_str() );    // second %s: the znode the watch was requested on
+        mon_log_write(MON_ZCLIENT_CONFIGZNODESWATCHSET_1, SQ_LOG_ERR, buf);
+
+        TRACE_EXIT;
+        return;
+    }
+    else
+    {
+        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+        {
+            trace_printf( "%s@%d Watch set on confignode=%s\n"
+                        , method_name, __LINE__
+                        , confignode.c_str() );
+        }
+    }
+
+    TRACE_EXIT;
+}
+
+int CZClient::ConfiguredZNodeWatchAdd( void )
+{   // Sets a watch on configuredZNodePath_ via ZNodeWatchSet() and returns that call's result.
+    const char method_name[] = "CZClient::ConfiguredZNodeWatchAdd";
+    TRACE_ENTRY;
+
+    int rc;
+    string configznode = configuredZNodePath_.c_str();
+
+    lock();  // the lock covers only the ZNodeWatchSet() call
+    rc = ZNodeWatchSet( configznode );
+    unlock();
+    if ( rc != ZOK )  // failure is logged here and also reported to the caller through rc
+    {
+        char buf[MON_STRING_BUF_SIZE];
+        snprintf( buf, sizeof(buf)
+                , "[%s], ZNodeWatchSet(%s) failed!\n"
+                , method_name
+                , configznode.c_str() );
+        mon_log_write(MON_ZCLIENT_CONFZNODEWATCHADD_1, SQ_LOG_ERR, buf);
+    }
+    else
+    {
+        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+        {
+            trace_printf( "%s@%d Watch set on configznode=%s\n"
+                        , method_name, __LINE__
+                        , configznode.c_str() );
+        }
+    }
+
+    TRACE_EXIT;
+    return(rc);
+}
+
+int CZClient::ConfiguredZNodeWatchDelete( void )
+{   // Calls ZNodeWatchReset() on configuredZNodePath_ and returns that call's result.
+    const char method_name[] = "CZClient::ConfiguredZNodeWatchDelete";
+    TRACE_ENTRY;
+
+    int rc = -1;
+
+    string configznode = configuredZNodePath_.c_str();
+
+    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+    {
+        trace_printf( "%s@%d Deleting configznode(%s)\n"
+                    , method_name, __LINE__
+                    , configznode.c_str() );
+    }
+    rc = ZNodeWatchReset( configznode );  // NOTE(review): messages say "deleted" but the call is a watch reset -- confirm wording
+    if ( rc == ZOK )  // only success is logged; a failing rc is returned without a log entry from this method
+    {
+        char buf[MON_STRING_BUF_SIZE];
+        snprintf( buf, sizeof(buf)
+                , "[%s], configznode (%s) deleted!\n"
+                , method_name, configznode.c_str() );
+        mon_log_write(MON_ZCLIENT_CONFZNODEWATCHDELETE_1, SQ_LOG_INFO, buf);
+    }
+
+    TRACE_EXIT;
+    return( rc );
+}
+
+int CZClient::ErrorZNodeCreate( const char *errorNode )
+{   // Creates errorZNodePath_/<errorNode> and errorZNodePath_/<errorNode>/<Node_name>; returns the second create's rc.
+    const char method_name[] = "CZClient::ErrorZNodeCreate";
+    TRACE_ENTRY;
+
+    int rc;
+    int zerr;
+
+    lock();
+    if ( IsRunningZNodeExpired( errorNode, zerr ) )  // nothing is created when the running znode is reported expired
+    {
+        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+        {
+            trace_printf( "%s@%d Running znode %s already expired (%s)\n"
+                        , method_name, __LINE__
+                        , errorNode
+                        , zerror(zerr) );
+        }
+        unlock();
+        return(ZOK);  // NOTE(review): this early return does not execute TRACE_EXIT
+    }
+    unlock();
+    pthread_yield();  // the lock is dropped around the yield and re-taken below
+    lock();
+
+    stringstream errorpath;
+    errorpath.str( "" );
+    errorpath << errorZNodePath_.c_str() << "/"
+              << errorNode;
+    string errorznode = errorpath.str( );
+
+    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+    {
+        trace_printf( "%s@%d Error ZNodeCreate(%s)\n"
+                    , method_name, __LINE__
+                    , errorznode.c_str() );
+    }
+
+    // Suppress error logging if error == ZNODEEXISTS
+    rc = ZNodeCreate( errorznode.c_str(), NULL, 0, true );  // this rc is overwritten by the child create below
+
+    errorpath.str( "" );
+    errorpath << errorZNodePath_.c_str() << "/"
+              << errorNode << "/"
+              << Node_name;
+    errorznode = errorpath.str( );  // now the child path: <errorNode>/<Node_name>
+
+    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+    {
+        trace_printf( "%s@%d Error child ZNodeCreate(%s)\n"
+                    , method_name, __LINE__
+                    , errorznode.c_str() );
+    }
+
+    // Suppress error logging if error == ZNODEEXISTS
+    rc = ZNodeCreate( errorznode.c_str(), NULL, 0, true );
+
+    unlock();
+
+    TRACE_EXIT;
+    return(rc);
+}
+
+// The errorNode is the znode which contains more than one errorChildNodes
+// and whose corresponding running znode is deleted to bring its node down
+// (see CZClient::HandleErrorChildZNodes())
+// The possibility exists that each errorChildNode is also an errorNode under
+// errorZNodePath_ if the errorNode passed in could not communicate with
+// one or more errorChildNodes.
+// Therefore, each errorChildNode that is also an errorNode, and its child
+// znode, must also be deleted.
+// For example, if the error znode tree is as follows:
+//   o node-b is the errorNode
+//       /trafodion/1/cluster/error/node-a/node-b
+//       /trafodion/1/cluster/error/node-b/node-a
+//       /trafodion/1/cluster/error/node-b/node-c
+//       /trafodion/1/cluster/error/node-c/node-b
+//   o Therefore,
+//       ErrorZNodeDelete( node-b, errorChildNodes-of-node-b )
+//           Delete(/trafodion/1/cluster/error/node-a/node-b)
+//           Delete(/trafodion/1/cluster/error/node-a)
+//           Delete(/trafodion/1/cluster/error/node-c/node-b)
+//           Delete(/trafodion/1/cluster/error/node-c)
+//           Delete(/trafodion/1/cluster/error/node-b/node-a)
+//           Delete(/trafodion/1/cluster/error/node-b/node-c)
+//           Delete(/trafodion/1/cluster/error/node-b)
+int CZClient::ErrorZNodeDelete( const char *errorNode, String_vector *errorChildNodes )
+{   // Deletes errorZNodePath_/<errorNode> and its child znodes; returns the rc of the last ZooKeeper call made.
+    const char method_name[] = "CZClient::ErrorZNodeDelete";
+    TRACE_ENTRY;
+
+    int rc = -1;
+    struct String_vector childnodes;
+
+    lock();  // every return path below must release this lock
+
+    stringstream errorpath;
+    stringstream childpath;
+    string errorznode;
+    string childznode;
+
+    errorpath.str( "" );
+    errorpath << errorZNodePath_.c_str() << "/"
+              << errorNode;
+    errorznode = errorpath.str( );
+
+retry:  // re-entered with a refreshed child list when the delete below reports ZNOTEMPTY
+
+    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+    {
+        for (int i = 0; i < errorChildNodes->count ; i++ )
+        {
+            trace_printf( "%s@%d errorNode=%s, errorChildNodes.count=%d, errorChildNode[%d]=%s\n"
+                        , method_name, __LINE__
+                        , errorNode
+                        , errorChildNodes->count
+                        , i
+                        , errorChildNodes->data[i] );
+        }
+        trace_printf( "%s@%d Processing delete of errorznode=%s\n"
+                    , method_name, __LINE__
+                    , errorznode.c_str() );
+    }
+
+    if ( errorChildNodes->count > 0 )
+    {
+        for (int j = 0; j < errorChildNodes->count ; j++ )
+        {
+            rc = ErrorZNodesGetChild( errorChildNodes->data[j], &childnodes );
+            if (rc == ZOK)
+            {
+                if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+                {
+                    trace_printf( "%s@%d errorNode=%s, errorChildNode=%s, childnodes.count=%d\n"
+                                , method_name, __LINE__
+                                , errorNode
+                                , errorChildNodes->data[j]
+                                , childnodes.count );
+                }
+
+                if (strcmp( errorChildNodes->data[j], errorNode) == 0)  // a child named like errorNode itself is skipped
+                {
+                    FreeStringVector( &childnodes );
+                    continue;
+                }
+
+                if (childnodes.count == 1 )  // the reverse entry is removed only when it is that znode's sole child
+                {
+                    ErrorChildZNodeDelete( errorNode, errorChildNodes->data[j], &childnodes );
+                }
+                FreeStringVector( &childnodes );
+            }
+
+            childpath.str( "" );
+            childpath << errorZNodePath_ << "/"
+                      << errorNode << "/"
+                      << errorChildNodes->data[j];
+            childznode = childpath.str( );
+
+            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+            {
+                trace_printf( "%s@%d Deleting childznode=%s\n"
+                            , method_name, __LINE__
+                            , childznode.c_str() );
+            }
+
+            ZNodeDelete( childznode );
+        }
+    }
+
+    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+    {
+        trace_printf( "%s@%d Deleting errorznode=%s\n"
+                    , method_name, __LINE__
+                    , errorznode.c_str() );
+    }
+
+    rc = ZNodeDelete( errorznode );
+    if (rc == ZNOTEMPTY)
+    {
+        FreeStringVector( errorChildNodes );
+        rc = ErrorZNodesGetChild( errorNode, errorChildNodes );
+        if ( rc != ZOK && rc != ZNONODE)
+        {
+            char buf[MON_STRING_BUF_SIZE];
+            snprintf( buf, sizeof(buf), "[%s], ErrorZNodesGetChild() failed!\n"
+                    , method_name );
+            mon_log_write(MON_ZCLIENT_HNDLEERRORCHILDZNODES_1, SQ_LOG_ERR, buf);
+            CLock::wakeOne();
+            unlock();  // release the lock taken at entry; this error path previously returned while still holding it
+            return(rc);
+        }
+
+        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+        {
+            trace_printf( "%s@%d Retry deleting errorznode=%s\n"
+                        , method_name, __LINE__
+                        , errorznode.c_str() );
+        }
+
+        goto retry;
+    }
+
+    unlock();
+
+    TRACE_EXIT;
+    return(rc);
+}
+
+int CZClient::ErrorChildZNodeDelete( const char *errorNode
+                                   , const char *errorChildNode
+                                   , String_vector *errorChildNodes )
+{   // For each entry of errorChildNodes equal to errorNode: deletes <errorChildNode>/<errorNode>, then <errorChildNode>.
+    const char method_name[] = "CZClient::ErrorChildZNodeDelete";
+    TRACE_ENTRY;
+
+    int rc1 = -1;  // result of deleting errorZNodePath_/<errorChildNode>/<errorNode>
+    int rc2 = -1;  // result of deleting errorZNodePath_/<errorChildNode>
+    stringstream errorpath;
+    stringstream childpath;
+    string errorznode;  // declared but not used in this method
+    string errorchildznode;  // used only in the trace output below
+    string childznode;
+
+    errorpath.str( "" );
+    errorpath << errorZNodePath_.c_str() << "/"
+              << errorNode << "/"
+              << errorChildNode;
+    errorchildznode = errorpath.str( );
+
+    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+    {
+        for (int i = 0; i < errorChildNodes->count ; i++ )
+        {
+            trace_printf( "%s@%d errorNode=%s, errorChildNode=%s, errorChildNodes.count=%d, errorChildNode[%d]=%s\n"
+                        , method_name, __LINE__
+                        , errorNode
+                        , errorChildNode
+                        , errorChildNodes->count
+                        , i
+                        , errorChildNodes->data[i] );
+        }
+        trace_printf( "%s@%d Processing delete of errorchildznode=%s\n"
+                    , method_name, __LINE__
+                    , errorchildznode.c_str() );
+    }
+
+    if ( errorChildNodes->count > 0 )
+    {
+        for (int j = 0; j < errorChildNodes->count ; j++ )
+        {
+            if (strcmp( errorChildNodes->data[j], errorNode) == 0)  // only entries naming errorNode trigger deletes
+            {
+                childpath.str( "" );
+                childpath << errorZNodePath_ << "/"
+                          << errorChildNode << "/"
+                          << errorNode;
+                childznode = childpath.str( );
+
+                if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+                {
+                    trace_printf( "%s@%d Deleting childznode=%s\n"
+                                , method_name, __LINE__
+                                , childznode.c_str() );
+                }
+
+                rc1 = ZNodeDelete( childznode );
+
+                childpath.str( "" );
+                childpath << errorZNodePath_ << "/"
+                          << errorChildNode;
+                childznode = childpath.str( );  // now the parent path: <errorChildNode>
+    
+                if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+                {
+                    trace_printf( "%s@%d Deleting childznode=%s\n"
+                                , method_name, __LINE__
+                                , childznode.c_str() );
+                }
+                
+                rc2 = ZNodeDelete( childznode );
+            }
+        }
+    }
+
+    TRACE_EXIT;
+    return((rc1 != ZOK)?rc1:rc2);  // rc1 when the first delete did not return ZOK, else rc2; -1 when no entry matched
+}
+
+int CZClient::ErrorZNodesGet( String_vector *nodes, bool doRetries )
+{   // Fetches the children of errorZNodePath_ into *nodes; returns the rc of the last ZooKeeper call made.
+    const char method_name[] = "CZClient::ErrorZNodesGet";
+    TRACE_ENTRY;
+
+    bool found = false;
+    int rc = -1;
+    int retries = 0;
+    Stat stat;
+
+    string errorznodes( errorZNodePath_.c_str() );
+
+    nodes->count = 0;
+    nodes->data = NULL;
+
+    while ( !found )
+    {
+        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+        {
+            trace_printf( "%s@%d errorznode=%s\n"
+                        , method_name, __LINE__, errorznodes.c_str() );
+        }
+
+        // Verify the existence of the parent
+        rc = ZooExistRetry( ZHandle, errorznodes.c_str( ), 0, &stat );
+        if ( rc == ZNONODE )
+        {
+            // Parent znode is missing: give up when retries are disabled or
+            // exhausted. Without this, doRetries == false would spin the
+            // loop forever for as long as the znode stays absent.
+            if (!doRetries || retries > ZOOKEEPER_RETRY_COUNT)
+                break;
+            retries++;
+            continue;
+        }
+        else if ( rc == ZOK )
+        {
+            // Now get the list of available znodes in the cluster.
+            //
+            // This will return child znodes for each monitor process that has
+            // registered, including this process.
+            rc = zoo_get_children( ZHandle, errorznodes.c_str( ), 0, nodes );
+            if ( rc == ZOK )
+            {
+                if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+                {
+                    trace_printf( "%s@%d errorznode=%s, errornodes.count=%d\n"
+                                , method_name, __LINE__
+                                , errorznodes.c_str()
+                                , nodes->count );
+                    for (int i = 0; i < nodes->count ; i++ )
+                    {
+                        trace_printf( "%s@%d errornodes[%d]=%s\n"
+                                    , method_name, __LINE__
+                                    , i
+                                    , nodes->data[i] );
+                    }
+                }
+                if (doRetries)
+                {
+                    if ( nodes->count && nodes->count < 2 )  // exactly one error znode so far
+                    {
+                        unlock();  // NOTE(review): assumes the caller holds the lock on entry -- confirm
+                        sleep(ZOOKEEPER_RETRY_WAIT);
+                        lock();
+                        if (retries < ZOOKEEPER_CHILD_RETRY_COUNT)
+                        {
+                            // Wait a bit to see if at least one other node is
+                            // having communications problems with the same node
+                            retries++;  // NOTE(review): *nodes is not freed before the next zoo_get_children() -- possible leak, confirm
+                            continue;
+                        }
+                        found = true;
+                    }
+                    else
+                    {
+                        unlock();
+                        sleep(ZOOKEEPER_RETRY_WAIT);
+                        lock();
+                        if (retries > ZOOKEEPER_CHILD_RETRY_COUNT)
+                            break;
+                        retries++;
+                        continue;
+                    }
+                }
+                else
+                {
+                    break;  // without retries the first successful listing is returned as-is
+                }
+            }
+            else
+            {
+                char buf[MON_STRING_BUF_SIZE];
+                snprintf( buf, sizeof(buf)
+                        , "[%s], zoo_get_children(%s) failed with error %s\n"
+                        ,  method_name, errorznodes.c_str( ), zerror(rc));
+                mon_log_write(MON_ZCLIENT_ERRORZNODESGET_1, SQ_LOG_ERR, buf);
+                break;
+            }
+        }
+        else  // error
+        {
+            char buf[MON_STRING_BUF_SIZE];
+            snprintf( buf, sizeof(buf)
+                    , "[%s], zoo_exists(%s) failed with error %s\n"
+                    ,  method_name, errorznodes.c_str( ), zerror(rc));
+            mon_log_write(MON_ZCLIENT_ERRORZNODESGET_2, SQ_LOG_ERR, buf);
+            break;
+        }
+    }
+
+    TRACE_EXIT;
+    return( rc );
+}
+
+int CZClient::ErrorZNodesGetChild( const char *errorNode, String_vector *childnodes )
+{
+    const char method_name[] = "CZClient::ErrorZNodesGetChild";
+    TRACE_ENTRY;
+
+    bool found = false;
+    int rc = -1;
+    int retries = 0;
+    Stat stat;
+
+    stringstream ss;
+    ss.str( "" );
+    ss << errorZNodePath_.c_str() << "/"
+       << errorNode;
+    string errorchildznode( ss.str( ) );
+
+    childnodes->count = 0;
+    childnodes->data = NULL;
+
+    while ( !found )
+    {
+        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+        {
+            trace_printf( "%s@%d errorchildznode=%s\n"
+                        , method_name, __LINE__, errorchildznode.c_str() );
+        }
+        // Verify the existence of the parent
+        rc = ZooExistRetry( ZHandle, errorchildznode.c_str( ), 0, &stat );
+        if ( rc == ZNONODE )
+        {
+            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+            {
+                trace_printf( "%s@%d errorchildznode=%s does not exist!\n"
+                            , method_name, __LINE__
+                            , errorchildznode.c_str( ) );
+            }
+            break;
+        }
+        else if ( rc == ZOK )
+        {
+            // Now get the list of available znodes in the cluster.
+            //
+            // This will return child znodes for each monitor process that has
+            // registered, including this process.
+            rc = zoo_get_children( ZHandle, errorchildznode.c_str( ), 0, childnodes );
+
+            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+            {
+                trace_printf( "%s@%d errorNode=%s, childnodes.count=%d\n"
+                            , method_name, __LINE__
+                            , errorNode
+                            , childnodes->count );
+            }
+
+            if ( childnodes->count > 0 )
+            {
+                found = true;
+            }
+            else
+            {
+                sleep(ZOOKEEPER_RETRY_WAIT);
+                if (retries > ZOOKEEPER_CHILD_RETRY_COUNT)
                     break;
                 retries++;    
                 continue;
@@ -660,8 +1169,8 @@
             char buf[MON_STRING_BUF_SIZE];
             snprintf( buf, sizeof(buf)
                     , "[%s], zoo_exists() for %s failed with error %s\n"
-                    ,  method_name, trafCluster.c_str( ), zerror(rc));
-            mon_log_write(MON_ZCLIENT_GETCLUSTERZNODES_2, SQ_LOG_ERR, buf);
+                    ,  method_name, errorchildznode.c_str( ), zerror(rc));
+            mon_log_write(MON_ZCLIENT_ERRORCHILDZNODESGET_1, SQ_LOG_ERR, buf);
             break;
         }
     }
@@ -670,196 +1179,240 @@
     return( rc );
 }
 
-int CZClient::GetZNodeData( string &monZnode, string &nodeName, int &pnid )
+void CZClient::ErrorZNodesDelete( void )
 {
-    const char method_name[] = "CZClient::GetZNodeData";
+    const char method_name[] = "CZClient::ErrorZNodesDelete";
     TRACE_ENTRY;
 
-    char  pnidStr[8] = { 0 };
-    char *tkn = NULL;
-    char  zkData[MAX_PROCESSOR_NAME];
-    int   rc = -1;
-    int   zkDataLen = sizeof(zkData);
-    Stat  stat;
+    int rc = -1;
+    struct String_vector errornodes;
+    struct String_vector childnodes;
 
-    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
-    {
-        trace_printf( "%s@%d monZnode=%s\n"
-                    , method_name, __LINE__, monZnode.c_str() );
-    }
-    rc = ZooExistRetry( ZHandle, monZnode.c_str( ), 0, &stat );
-    if ( rc == ZNONODE )
-    {
-        // return the error
-        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
-        {
-            trace_printf( "%s@%d monZnode=%s does not exist (ZNONODE)\n"
-                        , method_name, __LINE__, monZnode.c_str() );
-        }
-    }
-    else if ( rc == ZOK )
-    {
-        // Get the pnid from the data part of znode
-        rc = zoo_get( ZHandle, monZnode.c_str( ), false, zkData, &zkDataLen, &stat );
-        if ( rc == ZOK )
-        {
-            // The first token is the node name
-            tkn = strtok( zkData, ":" );
-            if ( tkn != NULL )
-            {
-                nodeName = tkn;
-            }
-            tkn = strtok( NULL, ":" );
-            if ( tkn != NULL )
-            {
-                strcpy( pnidStr, tkn );
-                pnid = atoi( pnidStr );
-            }
-            // TODO: Save monZnode path in corresponding physical node object
-            //       to match with when ZC_NODE is triggered
-        }
-        else
-        {
-            char buf[MON_STRING_BUF_SIZE];
-            snprintf( buf, sizeof(buf)
-                    , "[%s], zoo_get() for %s failed with error %s\n"
-                    ,  method_name, monZnode.c_str( ), zerror(rc));
-            mon_log_write(MON_ZCLIENT_GETZNODEDATA_2, SQ_LOG_ERR, buf);
-        }
-    }
-    else
+    lock();
+    rc = ErrorZNodesGet( &errornodes );
+    unlock();
+    if ( rc != ZOK && rc != ZNONODE )
     {
         char buf[MON_STRING_BUF_SIZE];
         snprintf( buf, sizeof(buf)
-                , "[%s], zoo_exists() for %s failed with error %s\n"
-                ,  method_name, monZnode.c_str( ), zerror(rc));
-        mon_log_write(MON_ZCLIENT_GETZNODEDATA_3, SQ_LOG_ERR, buf);
+                , "[%s], ErrorZNodesGet() failed!\n"
+                , method_name );
+        mon_log_write(MON_ZCLIENT_ERRORZNODESDELETE_1, SQ_LOG_ERR, buf);
+        CLock::wakeOne();
+        return;
+    }
+
+    stringstream errorpath;
+    stringstream childpath;
+    string errorznode;
+    string childznode;
+
+    if ( errornodes.count > 0 )
+    {
+        for (int i = 0; i < errornodes.count ; i++ )
+        {
+            errorpath.str( "" );
+            errorpath << errorZNodePath_.c_str() << "/"
+                      << errornodes.data[i];
+            errorznode = errorpath.str( );
+        
+            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+            {
+                trace_printf( "%s@%d Deleting errorznode=%s\n"
+                            , method_name, __LINE__
+                            , errorznode.c_str() );
+            }
+
+            rc = ErrorZNodesGetChild( errornodes.data[i], &childnodes );
+            if ( rc != ZOK && rc != ZNONODE )
+            {
+                char buf[MON_STRING_BUF_SIZE];
+                snprintf( buf, sizeof(buf)
+                        , "[%s], ErrorZNodesGetChild() failed!\n"
+                        , method_name );
+                mon_log_write(MON_ZCLIENT_ERRORZNODESDELETE_2, SQ_LOG_ERR, buf);
+                CLock::wakeOne();
+                return;
+            }
+
+            if ( childnodes.count > 0 )
+            {
+                for (int j = 0; j < childnodes.count ; j++ )
+                {
+                    childpath.str( "" );
+                    childpath << errorZNodePath_.c_str() << "/"
+                              << errornodes.data[i] << "/"
+                              << childnodes.data[j];
+                    childznode = childpath.str( );
+                
+                    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+                    {
+                        trace_printf( "%s@%d Deleting childznode=%s\n"
+                                    , method_name, __LINE__
+                                    , childznode.c_str() );
+                    }
+        
+                    ZNodeDelete( childznode );
+                }
+            }
+
+            FreeStringVector( &childnodes );
+            ZNodeDelete( errorznode );
+        }
+        FreeStringVector( &errornodes );
+    }
+
+    TRACE_EXIT;
+}
+
+void CZClient::ErrorZNodesWatchSet( void )
+{
+    const char method_name[] = "CZClient::ErrorZNodesWatchSet";
+    TRACE_ENTRY;
+
+    // Sets a child watch on errorZNodePath_ through ZNodeWatchChildSet().
+    // A failure is logged; no status is returned to the caller.
+    int rc;
+    string errornode;
+
+    errornode = errorZNodePath_.c_str();
+
+    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+    {
+        trace_printf( "%s@%d Setting watch set on errornode=%s\n"
+                    , method_name, __LINE__
+                    , errornode.c_str() );
+    }
+
+    rc = ZNodeWatchChildSet( errornode );
+    if ( rc != ZOK )
+    {
+        char buf[MON_STRING_BUF_SIZE];
+        snprintf( buf, sizeof(buf)
+                , "[%s], ZNodeWatchChildSet(%s) failed!\n"
+                , method_name
+                , errornode.c_str() );
+        mon_log_write(MON_ZCLIENT_ERRORZNODESWATCHSET_1, SQ_LOG_ERR, buf);
+
+        TRACE_EXIT;
+        return;
+    }
+    else
+    {
+        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+        {
+            trace_printf( "%s@%d Watch set on errornode=%s\n"
+                        , method_name, __LINE__
+                        , errornode.c_str() );
+        }
+    }
+
+    TRACE_EXIT;
+}
+
+int CZClient::ErrorZNodeWatchAdd( void )
+{
+    const char method_name[] = "CZClient::ErrorZNodeWatchAdd";
+    TRACE_ENTRY;
+    // Sets a watch on errorZNodePath_ and returns the ZNodeWatchSet() result.
+    int rc;
+    string errorznode = errorZNodePath_.c_str();
+    // lock()/unlock() bracket only the ZNodeWatchSet() call itself
+    lock();
+    rc = ZNodeWatchSet( errorznode );
+    unlock();
+    if ( rc != ZOK )
+    {
+        char buf[MON_STRING_BUF_SIZE];
+        snprintf( buf, sizeof(buf)
+                , "[%s], ZNodeWatchSet(%s) failed!\n"
+                , method_name
+                , errorznode.c_str() );
+        mon_log_write(MON_ZCLIENT_ERRORZNODEWATCHADD_1, SQ_LOG_ERR, buf);
+    }
+    else
+    {
+        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+        {
+            trace_printf( "%s@%d Watch set on errorznode=%s\n"
+                        , method_name, __LINE__
+                        , errorznode.c_str() );
+        }
+    }
+
+    TRACE_EXIT;
+    return(rc);
+}
+
+int CZClient::ErrorZNodeWatchDelete( void )
+{
+    const char method_name[] = "CZClient::ErrorZNodeWatchDelete";
+    TRACE_ENTRY;
+    // Calls ZNodeWatchReset() on errorZNodePath_ and returns its result code.
+    int rc = -1;
+    // Only success (ZOK) is logged below; any other code is just returned.
+    string errorznode = errorZNodePath_.c_str();
+
+    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+    {
+        trace_printf( "%s@%d Deleting errorznode(%s)\n"
+                    , method_name, __LINE__
+                    , errorznode.c_str() );
+    }
+    rc = ZNodeWatchReset( errorznode );
+    if ( rc == ZOK )
+    {
+        char buf[MON_STRING_BUF_SIZE];
+        snprintf( buf, sizeof(buf)
+                , "[%s], errorznode (%s) deleted!\n"
+                , method_name, errorznode.c_str() );
+        mon_log_write(MON_ZCLIENT_ERRORZNODEWATCHDELETE_1, SQ_LOG_INFO, buf);
     }
 
     TRACE_EXIT;
     return( rc );
 }
 
-void CZClient::HandleMasterZNode ( void )
+void CZClient::HandleChangedZNode( void )
 {
-     const char method_name[] = "CZClient::HandleMasterZNode";
+    const char method_name[] = "CZClient::HandleChangedZNode";
     TRACE_ENTRY;
 
-    char  pathStr[MAX_PROCESSOR_NAME] = { 0 };
-    char  nodeName[MAX_PROCESSOR_NAME] = { 0 };
-    char *tkn = NULL;
-    char *tknStart = pathStr;
-    char *tknLast = NULL;
-    string monZnode;
-    
-    monZnode.assign( znodeQueue_.front() );
-
-    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
-    {
-        trace_printf("%s@%d" " - znodePath=%s, znodeQueue_.size=%ld\n"
-                        , method_name, __LINE__
-                        , monZnode.c_str(), znodeQueue_.size() );
-    }
-
-    znodeQueue_.pop_front();
-       
-    strcpy( pathStr, monZnode.c_str() );
-    tknStart++; // skip the first '/'
-    tkn = strtok( tknStart, "/" );
-    do
-    {
-        tknLast = tkn;
-        tkn = strtok( NULL, "/" );
-    }
-    while( tkn != NULL );
-    
-    strcpy( nodeName, tknLast );
-    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
-    {
-        trace_printf( "%s@%d nodeName=%s\n"
-                    , method_name, __LINE__
-                    , strlen(nodeName) ? nodeName : "" );
-    }
-       
-    string masterpath = zkRootNode_ + zkRootNodeInstance_ + ZCLIENT_MASTER_ZNODE;
-    std::size_t found = monZnode.find(masterpath);
-    // if it is the master node, then call HandleAssignMonitorLeader
-    if (found!=std::string::npos)
-    // zookeeper node, assume stale
-    {
-        HandleAssignMonitorLeader(nodeName);
-    }
-    
-    TRACE_EXIT; 
-}
-
-void CZClient::HandleExpiredZNode( void )
-{
-    const char method_name[] = "CZClient::HandleExpiredZNode";
-    TRACE_ENTRY;
-
-    if ( IsCheckCluster() )
+    if ( IsClusterWatchEnabled() )
     {
         char  pathStr[MAX_PROCESSOR_NAME] = { 0 };
         char  nodeName[MAX_PROCESSOR_NAME] = { 0 };
-        char *tkn = NULL;
-        char *tknStart = pathStr;
-        char *tknLast = NULL;
-        string monZnode;
+        string znode;
     
-        monZnode.assign( znodeQueue_.front() );
 
-        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+        while (znodeChangedQueue_.size() != 0)
         {
-            trace_printf("%s@%d" " - znodePath=%s, znodeQueue_.size=%ld\n"
-                        , method_name, __LINE__
-                        , monZnode.c_str(), znodeQueue_.size() );
-        }
-
-        znodeQueue_.pop_front();
-        
-        strcpy( pathStr, monZnode.c_str() );
-
-        tknStart++; // skip the first '/'
-        tkn = strtok( tknStart, "/" );
-        do
-        {
-            tknLast = tkn;
-            tkn = strtok( NULL, "/" );
-        }
-        while( tkn != NULL );
-
-        strcpy( nodeName, tknLast );
-        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
-        {
-            trace_printf( "%s@%d nodeName=%s\n"
-                        , method_name, __LINE__
-                        , strlen(nodeName) ? nodeName : "" );
-        }
-
-        string masterpath = zkRootNode_ + zkRootNodeInstance_ + ZCLIENT_MASTER_ZNODE;
-        std::size_t found = monZnode.find(masterpath);
-        // if it is not the master node, then call HandleNodeExpiration
-        if (found==std::string::npos)
-        {    
-             char buf[MON_STRING_BUF_SIZE];
-             snprintf( buf, sizeof(buf)
-                , "[%s], %s was deleted, handling node (%s) as a down node!\n"
-                ,  method_name, monZnode.c_str(), nodeName );
-              mon_log_write(MON_ZCLIENT_CHECKZNODE_1, SQ_LOG_ERR, buf);
-         
-             HandleNodeExpiration( nodeName );
-        }
-        else // zookeeper node, assume stale
-        {
-             HandleAssignMonitorLeader(nodeName);
+            znode.assign( znodeChangedQueue_.front() );
+    
+            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+            {
+                trace_printf("%s@%d" " - znodePath=%s, znodeChangedQueue_.size=%ld\n"
+                            , method_name, __LINE__
+                            , znode.c_str(), znodeChangedQueue_.size() );
+            }
+    
+            znodeChangedQueue_.pop_front();
+            
+            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+            {
+                trace_printf( "%s@%d nodeName=%s\n"
+                            , method_name, __LINE__
+                            , strlen(nodeName) ? nodeName : "" );
+            }
+    
+            HandleNodeChange( nodeName );
         }
     }
     else
     {
         if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
         {
-            trace_printf( "%s@%d CheckCluster is NOT set!\n"
+            trace_printf( "%s@%d ClusterWatchEnabled is NOT set!\n"
                         , method_name, __LINE__ );
         }
     }
@@ -867,6 +1420,483 @@
     TRACE_EXIT;
 }
 
+void CZClient::HandleChildZNode( void )
+{
+    const char method_name[] = "CZClient::HandleChildZNode";
+    TRACE_ENTRY;
+
+    if ( IsClusterWatchEnabled() )
+    {
+        // Drain znodeChildQueue_.  Each queued entry is a znode path that is
+        // dispatched by comparing it with the two parent paths handled here
+        // (configuredZNodePath_ and errorZNodePath_).  Any other path is
+        // logged as an error and dropped.  Nothing is done when the cluster
+        // watch is not enabled (only a trace line is written).
+        string znode;
+
+        while (znodeChildQueue_.size() != 0)
+        {
+            znode.assign( znodeChildQueue_.front() );
+
+            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+            {
+                trace_printf("%s@%d" " - znodePath=%s, znodeChildQueue_.size=%ld\n"
+                            , method_name, __LINE__
+                            , znode.c_str(), znodeChildQueue_.size() );
+            }
+
+            znodeChildQueue_.pop_front();
+
+            if (znode.compare( configuredZNodePath_ ) == 0)
+            {
+                // The configuredZNodePath_ contains a child znode for each
+                // node in the static configuration.
+                // As nodes are added to or deleted from the static
+                // configuration, a corresponding child znode is added or
+                // deleted under the configuredZNodePath_
+                HandleConfiguredZNodes();
+            }
+            else if (znode.compare( errorZNodePath_ ) == 0)
+            {
+                HandleErrorZNodes();
+            }
+            else
+            {
+                char buf[MON_STRING_BUF_SIZE];
+                snprintf( buf, sizeof(buf)
+                        , "[%s], Don't know how to handle children of znode=%s\n"
+                        , method_name
+                        , znode.c_str() );
+                mon_log_write(MON_ZCLIENT_HANDLECHILDZNODE_1, SQ_LOG_ERR, buf);
+            }
+        }
+    }
+    else
+    {
+        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+        {
+            trace_printf( "%s@%d ClusterWatchEnabled is NOT set!\n"
+                        , method_name, __LINE__ );
+        }
+    }
+
+    TRACE_EXIT;
+}
+
+void CZClient::HandleConfiguredZNodes( void )
+{
+    const char method_name[] = "CZClient::HandleConfiguredZNodes";
+    TRACE_ENTRY;
+    // Fetches the configured znodes and, if any exist, calls HandleNodeConfigurationChange() once.
+    int rc = -1;
+    struct String_vector confignodes;
+
+    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+    {
+        trace_printf( "%s@%d Handling Configured ZNodes!\n"
+                    , method_name, __LINE__ );
+    }
+    // NOTE(review): the failure path logs with the HANDLEERRORZNODES event id and returns without TRACE_EXIT - confirm
+    rc = ConfiguredZNodesGet( &confignodes );
+    if ( rc != ZOK )
+    {
+        char buf[MON_STRING_BUF_SIZE];
+        snprintf( buf, sizeof(buf)
+                , "[%s], ConfiguredZNodesGet() failed!\n"
+                , method_name );
+        mon_log_write(MON_ZCLIENT_HANDLEERRORZNODES_1, SQ_LOG_ERR, buf);
+        CLock::wakeOne();
+        return;
+    }
+
+    stringstream configpath;
+    string configznode;
+    // The loop below only traces each child path; it does no per-child processing.
+    if ( confignodes.count > 0 )
+    {
+        for (int i = 0; i < confignodes.count ; i++ )
+        {
+            configpath.str( "" );
+            configpath << configuredZNodePath_.c_str() << "/"
+                       << confignodes.data[i];
+            configznode = configpath.str( );
+        
+            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+            {
+                trace_printf( "%s@%d Handling configznode=%s\n"
+                            , method_name, __LINE__
+                            , configznode.c_str() );
+            }
+        }
+        HandleNodeConfigurationChange();
+        FreeStringVector( &confignodes );
+    }
+
+    TRACE_EXIT;
+}
+
+void CZClient::HandleCreatedZNode( void )
+{
+    const char method_name[] = "CZClient::HandleCreatedZNode";
+    TRACE_ENTRY;
+
+    if ( IsClusterWatchEnabled() )
+    {
+        char  pathStr[MAX_PROCESSOR_NAME] = { 0 };
+        char  nodeName[MAX_PROCESSOR_NAME] = { 0 };
+        char *tkn = NULL;
+        char *tknLast = NULL;
+        string znode;
+
+        // Drain znodeCreatedQueue_: the last '/'-separated component of each
+        // queued znode path is passed to HandleNodeCreated() as the node name.
+        while (znodeCreatedQueue_.size() != 0)
+        {
+            znode.assign( znodeCreatedQueue_.front() );
+
+            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+            {
+                trace_printf("%s@%d" " - znodePath=%s, znodeCreatedQueue_.size=%ld\n"
+                            , method_name, __LINE__
+                            , znode.c_str(), znodeCreatedQueue_.size() );
+            }
+
+            znodeCreatedQueue_.pop_front();
+            // Bounded copy: the buffers start zero-filled and strncpy() pads
+            // with NULs, so the final byte always stays the terminator.
+            strncpy( pathStr, znode.c_str(), sizeof(pathStr) - 1 );
+
+            // strtok() skips leading '/', so restart from pathStr on every pass
+            tknLast = NULL;
+            for (tkn = strtok( pathStr, "/" ); tkn != NULL; tkn = strtok( NULL, "/" ))
+            {
+                tknLast = tkn;
+            }
+            if (tknLast == NULL) continue; // path has no component to handle
+
+            strncpy( nodeName, tknLast, sizeof(nodeName) - 1 );
+            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+            {
+                trace_printf( "%s@%d nodeName=%s\n"
+                            , method_name, __LINE__
+                            , strlen(nodeName) ? nodeName : "" );
+            }
+
+            HandleNodeCreated( nodeName );
+        }
+    }
+    else
+    {
+        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+        {
+            trace_printf( "%s@%d ClusterWatchEnabled is NOT set!\n"
+                        , method_name, __LINE__ );
+        }
+    }
+
+    TRACE_EXIT;
+}
+
+void CZClient::HandleDeletedZNode( void )
+{
+    const char method_name[] = "CZClient::HandleDeletedZNode";
+    TRACE_ENTRY;
+
+    if ( IsClusterWatchEnabled() )
+    {
+        char  pathStr[MAX_PROCESSOR_NAME] = { 0 };
+        char  nodeName[MAX_PROCESSOR_NAME] = { 0 };
+        char *tkn = NULL;
+        char *tknLast = NULL;
+        string znode;
+
+        // Drain znodeDeletedQueue_: the last '/'-separated component of each
+        // queued znode path is passed to HandleNodeExpiration() as the node name.
+        while (znodeDeletedQueue_.size() != 0)
+        {
+            znode.assign( znodeDeletedQueue_.front() );
+
+            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+            {
+                trace_printf("%s@%d" " - znodePath=%s, znodeDeletedQueue_.size=%ld\n"
+                            , method_name, __LINE__
+                            , znode.c_str(), znodeDeletedQueue_.size() );
+            }
+
+            znodeDeletedQueue_.pop_front();
+            // Bounded copy: the buffers start zero-filled and strncpy() pads
+            // with NULs, so the final byte always stays the terminator.
+            strncpy( pathStr, znode.c_str(), sizeof(pathStr) - 1 );
+
+            // strtok() skips leading '/', so restart from pathStr on every pass
+            tknLast = NULL;
+            for (tkn = strtok( pathStr, "/" ); tkn != NULL; tkn = strtok( NULL, "/" ))
+            {
+                tknLast = tkn;
+            }
+            if (tknLast == NULL) continue; // path has no component to handle
+
+            strncpy( nodeName, tknLast, sizeof(nodeName) - 1 );
+            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+            {
+                trace_printf( "%s@%d nodeName=%s\n"
+                            , method_name, __LINE__
+                            , strlen(nodeName) ? nodeName : "" );
+            }
+
+            // Invoke the callback to handle the node expiration
+            HandleNodeExpiration( nodeName );
+        }
+    }
+    else
+    {
+        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+        {
+            trace_printf( "%s@%d ClusterWatchEnabled is NOT set!\n"
+                        , method_name, __LINE__ );
+        }
+    }
+
+    TRACE_EXIT;
+}
+
+void CZClient::HandleErrorZNode( const char *errorNode, const char *childNode )
+{
+    const char method_name[] = "CZClient::HandleErrorZNode";
+    TRACE_ENTRY;
+    // When childNode is a child of the errorNode znode: if it is the only child,
+    // delete it and the errorNode znode; otherwise call HandleErrorChildZNodes().
+    int rc = -1;
+    struct String_vector childnodes;
+    stringstream errorpath;
+    stringstream childpath;
+    string errorznode;
+    string childznode;
+
+    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+    {
+        trace_printf( "%s@%d Handling errorNode=%s\n"
+                    , method_name, __LINE__
+                    , errorNode );
+    }
+
+    rc = ErrorZNodesGetChild( errorNode, &childnodes );
+    if ( rc != ZOK && rc != ZNONODE)
+    {
+        char buf[MON_STRING_BUF_SIZE];
+        snprintf( buf, sizeof(buf)
+                , "[%s], ErrorZNodesGetChild() failed!\n"
+                , method_name );
+        mon_log_write(MON_ZCLIENT_HANDLEERRORZNODE_1, SQ_LOG_ERR, buf);
+        return;
+    }
+
+    if ( childnodes.count > 0 )
+    {
+        for (int i = 0; i < childnodes.count ; i++ )
+        {
+            if (strcmp( childnodes.data[i], childNode ) == 0)
+            {
+                errorpath.str( "" );
+                errorpath << errorZNodePath_.c_str() << "/"
+                          << errorNode;
+                errorznode = errorpath.str( );
+
+                childpath.str( "" );
+                childpath << errorpath.str( ) << "/"
+                          << childNode;
+                childznode = childpath.str( );
+
+                // Delete the parent errorznode if it only had one childznode
+                if (childnodes.count == 1)
+                {
+                    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+                    {
+                        trace_printf( "%s@%d Deleting childznode=%s\n"
+                                    , method_name, __LINE__
+                                    , childznode.c_str() );
+                    }
+                    ZNodeDelete( childznode );
+                    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+                    {
+                        trace_printf( "%s@%d Deleting errorznode=%s\n"
+                                    , method_name, __LINE__
+                                    , errorznode.c_str() );
+                    }
+                    ZNodeDelete( errorznode );
+                }
+                else if (childnodes.count > 1)
+                {
+                    HandleErrorChildZNodes( errorNode );
+                }
+            }
+        }
+    }
+
+    FreeStringVector( &childnodes );
+
+    TRACE_EXIT;
+}
+
+void CZClient::HandleErrorZNodes( void )
+{
+    const char method_name[] = "CZClient::HandleErrorZNodes";
+    TRACE_ENTRY;
+    // Lists the error znodes and calls HandleErrorChildZNodes() on each one.
+    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+    {
+        trace_printf( "%s@%d Handling Error ZNodes!\n"
+                    , method_name, __LINE__ );
+    }
+
+    int rc = -1;
+    struct String_vector errornodes;
+    // NOTE(review): the second argument (false) is presumably the doRetries flag - confirm against ErrorZNodesGet()
+    rc = ErrorZNodesGet( &errornodes, false );
+    if ( rc != ZOK )
+    {
+        char buf[MON_STRING_BUF_SIZE];
+        snprintf( buf, sizeof(buf)
+                , "[%s], ErrorZNodesGet() failed!\n"
+                , method_name );
+        mon_log_write(MON_ZCLIENT_HANDLEERRORZNODES_1, SQ_LOG_ERR, buf);
+        return;
+    }
+
+    stringstream errorpath;
+    string errorznode;
+
+    if ( errornodes.count > 0 )
+    {
+        for (int i = 0; i < errornodes.count ; i++ )
+        {
+            errorpath.str( "" );
+            errorpath << errorZNodePath_.c_str() << "/"
+                      << errornodes.data[i];
+            errorznode = errorpath.str( );
+        
+            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+            {
+                trace_printf( "%s@%d Handling errorznode=%s\n"
+                            , method_name, __LINE__
+                            , errorznode.c_str() );
+            }
+
+            HandleErrorChildZNodes( errornodes.data[i] );
+        }
+        FreeStringVector( &errornodes );
+    }
+
+    TRACE_EXIT;
+}
+
+void CZClient::HandleErrorChildZNodes( const char *errorNode )
+{
+    const char method_name[] = "CZClient::HandleErrorChildZNodes";
+    TRACE_ENTRY;
+
+    // Fetches the children of the errorNode znode via ErrorZNodesGetChild().
+    // When more than one child exists, calls ErrorZNodeDelete() for the error
+    // znode and then RunningZNodeDelete( errorNode ).  With zero or one child
+    // the error znode is bypassed (only a trace line is written).  A fetch
+    // failure other than ZNONODE is logged and ends the call.
+    int rc = -1;
+    struct String_vector childnodes;
+
+    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+    {
+        trace_printf( "%s@%d Handling errorNode=%s\n"
+                    , method_name, __LINE__
+                    , errorNode );
+    }
+
+    rc = ErrorZNodesGetChild( errorNode, &childnodes );
+    if ( rc != ZOK && rc != ZNONODE)
+    {
+        char buf[MON_STRING_BUF_SIZE];
+        snprintf( buf, sizeof(buf)
+                , "[%s], ErrorZNodesGetChild() failed!\n"
+                , method_name );
+        mon_log_write(MON_ZCLIENT_HNDLEERRORCHILDZNODES_1, SQ_LOG_ERR, buf);
+        return;
+    }
+
+    if ( childnodes.count > 1 )
+    {
+        ErrorZNodeDelete( errorNode, &childnodes );
+        // Delete the corresponding running znode which will trigger node down
+        RunningZNodeDelete( errorNode );
+    }
+    else
+    {
+        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+        {
+            trace_printf( "%s@%d Bypassing errorNode=%s, childnodes.count=%d\n"
+                        , method_name, __LINE__
+                        , errorNode
+                        , childnodes.count );
+        }
+    }
+
+    FreeStringVector( &childnodes );
+
+    TRACE_EXIT;
+}
+
+void CZClient::HandleErrorChildZNodesForZNodeChild( const char *childNode, bool doRetries )
+{
+    const char method_name[] = "CZClient::HandleErrorChildZNodesForZNodeChild";
+    TRACE_ENTRY;
+    // Lists the error znodes with ErrorZNodesGet( ..., doRetries ) and calls
+    // HandleErrorZNode( <error znode name>, childNode ) for each one of them.
+    int rc = -1;
+    struct String_vector errornodes;
+
+    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+    {
+        trace_printf( "%s@%d Handling childNode=%s\n"
+                    , method_name, __LINE__
+                    , childNode );
+    }
+
+    rc = ErrorZNodesGet( &errornodes, doRetries );
+    if ( rc != ZOK )
+    {
+        char buf[MON_STRING_BUF_SIZE];
+        snprintf( buf, sizeof(buf)
+                , "[%s], ErrorZNodesGet() failed!\n"
+                , method_name );
+        mon_log_write(MON_ZCLIENT_HNDLERRCHLZNFORZNCHL_1, SQ_LOG_ERR, buf);
+        return;
+    }
+
+    stringstream errorpath;
+    string errorznode;
+
+    if ( errornodes.count > 0 )
+    {
+        for (int i = 0; i < errornodes.count ; i++ )
+        {
+            errorpath.str( "" );
+            errorpath << errorZNodePath_.c_str() << "/"
+                      << errornodes.data[i];
+            errorznode = errorpath.str( );
+
+            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+            {
+                trace_printf( "%s@%d Handling errorznode=%s\n"
+                            , method_name, __LINE__
+                            , errorznode.c_str() );
+            }
+
+            HandleErrorZNode( errornodes.data[i], childNode );
+        }
+        FreeStringVector( &errornodes );
+    }
+
+    TRACE_EXIT;
+}
+
 int CZClient::InitializeZClient( void )
 {
     const char method_name[] = "CZClient::InitializeZClient";
@@ -875,27 +1905,27 @@
     int rc;
     int retries = 0;
 
-    rc = MakeClusterZNodes();
+    rc = ZNodesTreeCreate();
 
     while ( rc != ZOK && retries < ZOOKEEPER_RETRY_COUNT)
     {
         sleep(ZOOKEEPER_RETRY_WAIT);
         retries++;
-        rc = MakeClusterZNodes();
+        rc = ZNodesTreeCreate();
     }
 
     if ( rc == ZOK )
     {
-        rc = RegisterMyNodeZNode();
+        rc = MyRunningZNodeCreate();
     }
 
     TRACE_EXIT;
     return( rc );
 }
 
-bool CZClient::IsZNodeExpired( const char *nodeName, int &zerr )
+bool CZClient::IsRunningZNodeExpired( const char *nodeName, int &zerr )
 {
-    const char method_name[] = "CZClient::IsZNodeExpired";
+    const char method_name[] = "CZClient::IsRunningZNodeExpired";
     TRACE_ENTRY;
 
     bool  expired = false;
@@ -903,9 +1933,7 @@
     Stat  stat;
     stringstream newpath;
     newpath.str( "" );
-    newpath << zkRootNode_.c_str() 
-            << zkRootNodeInstance_.c_str() 
-            << ZCLIENT_CLUSTER_ZNODE << "/"
+    newpath << runningZNodePath_.c_str() << "/"
             << nodeName;
     string monZnode = newpath.str( );
 
@@ -968,20 +1996,157 @@
     return( expired );
 }
 
-int CZClient::CreateMasterZNode(  const char *nodeName )
+bool CZClient::IsZNodeMaster( const char *nodeName )
 {
-    const char method_name[] = "CZClient::CreateMasterZNode";
+    const char method_name[] = "CZClient::IsZNodeMaster";
+    TRACE_ENTRY;
+    // Returns true when nodeName equals the name from MasterWaitForAndReturn( true ).
+    bool isMaster = false;
+    string masterZNode;
+    // MasterWaitForAndReturn() may return NULL; string::assign(NULL) is undefined
+    const char *master = MasterWaitForAndReturn( true );
+    if (master != NULL) masterZNode.assign( master );
+    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+    {
+        trace_printf( "%s@%d masterZNode=%s, nodeName=%s\n"
+                    , method_name, __LINE__
+                    , masterZNode.c_str()
+                    , nodeName );
+    }
+
+    isMaster = (masterZNode.compare( nodeName ) == 0) ? true : false;
+
+    TRACE_EXIT;
+    return( isMaster );
+}
+
+const char* CZClient::MasterWaitForAndReturn( bool doWait )
+{
+    const char method_name[] = "CZClient::MasterWaitForAndReturn";
+    TRACE_ENTRY;
+    // Returns the first child name under masterZNodePath_, or NULL when none is found.
+    bool found = false;
+    int rc = -1;
+    int retries = 0;
+    Stat stat;
+    // NOTE(review): nodes is never freed in this function while nodes.data[0] is returned - confirm ownership
+    struct String_vector nodes = {0, NULL};
+    string masterMonitor( masterZNodePath_.c_str() );
+
+    // Retry up to ZCLIENT_MASTER_ZNODE_RETRY_COUNT times before giving up (no retry when doWait is false).
+    while ( (StateGet() != ZC_SHUTDOWN) && (!found) && (retries < ZCLIENT_MASTER_ZNODE_RETRY_COUNT)) 
+    {
+        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+        {
+            trace_printf( "%s@%d masterMonitor path=%s\n"
+                        , method_name, __LINE__, masterMonitor.c_str() );
+        }
+
+        if (MyNode && MyNode->IsPendingNodeDown())
+        {
+            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+            {
+                trace_printf( "%s@%d MyNode IsPendingNodeDown=%s\n"
+                            , method_name, __LINE__
+                            , MyNode->IsPendingNodeDown()?"true":"false" );
+            }
+            break;
+        }
+
+        // Verify the existence of the parent ZCLIENT_MASTER_ZNODE
+        rc = ZooExistRetry( ZHandle, masterMonitor.c_str( ), 0, &stat );
+        
+        if ( rc == ZNONODE )
+        {
+            if (doWait == false)
+            {
+                break;
+            } 
+            sleep(ZOOKEEPER_RETRY_WAIT);
+            retries++;
+            continue;
+        }
+        else if ( rc == ZOK )
+        {
+            // Now get the master znode that registered under the masterMonitor
+            // znode.
+            //
+            // This will return one child znode for the monitor process that has
+            // registered as the current master.
+            rc = zoo_get_children( ZHandle, masterMonitor.c_str( ), 0, &nodes );
+            if ( nodes.count > 0 )
+            {
+                if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+                {
+                    trace_printf( "%s@%d nodes.count=%d\n"
+                                , method_name, __LINE__
+                                , nodes.count );
+                }
+                found = true;
+            }
+            else
+            {
+                if (doWait == false)
+                {
+                    break;
+                }
+                sleep(ZOOKEEPER_RETRY_WAIT);
+                retries++;
+                continue;
+            }
+        }
+         
+        else  // error
+        { 
+            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+            {
+                trace_printf( "%s@%d Error (MasterMonitor) MasterWaitForAndReturn() returned rc (%d), retries %d\n"
+                        , method_name, __LINE__, rc, retries );
+            }
+            char buf[MON_STRING_BUF_SIZE];
+            snprintf( buf, sizeof(buf)
+                    , "[%s], ZooExistRetry() for %s failed with error %s\n"
+                    ,  method_name, masterMonitor.c_str( ), zerror(rc));
+            mon_log_write(MON_ZCLIENT_WAITFORRETURNMASTER_1, SQ_LOG_ERR, buf);
+            break;
+        }
+    }
+         
+    //should we assert nodes.count == 1?
+    if (found)
+    {
+        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+        {
+            trace_printf( "%s@%d (MasterMonitor) Master Monitor found (%s/%s)\n"
+                        , method_name, __LINE__, masterMonitor.c_str(), nodes.data[0] );
+        }
+        TRACE_EXIT;
+        return nodes.data[0];
+    }
+    else
+    {
+      if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+        {
+            trace_printf( "%s@%d (MasterMonitor) Master Monitor NOT found\n" , method_name, __LINE__);
+        }
+    }
+
+    TRACE_EXIT;
+    return NULL;
+}
+
+int CZClient::MasterZNodeCreate(  const char *nodeName )
+{
+    const char method_name[] = "CZClient::MasterZNodeCreate";
     TRACE_ENTRY;
 
     int rc;
     int retries = 0;
-    
+
     stringstream masterpath;
     masterpath.str( "" );
-    masterpath << zkRootNode_.c_str() 
-            << zkRootNodeInstance_.c_str() 
-            << ZCLIENT_MASTER_ZNODE<< "/"
-            << nodeName;
+    masterpath << masterZNodePath_.c_str() << "/"
+               << nodeName;
             
     string monZnode = masterpath.str( );
 
@@ -992,32 +2157,27 @@
 
     if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
     {
-        trace_printf( "%s@%d RegisterZNode(%s:%s)\n"
+        trace_printf( "%s@%d ZNodeCreate(%s:%s)\n"
                     , method_name, __LINE__
                     , monZnode.c_str()
                     , monData.c_str() );
     }
 
-    rc = RegisterZNode( monZnode.c_str(), monData.c_str(), ZOO_EPHEMERAL );
+    rc = ZNodeCreate( monZnode.c_str(), monData.c_str(), ZOO_EPHEMERAL );
     while ( ((rc == ZCONNECTIONLOSS) || (rc == ZOPERATIONTIMEOUT)) && retries < ZOOKEEPER_RETRY_COUNT)
     {
         sleep(ZOOKEEPER_RETRY_WAIT);
         retries++;
-        rc = RegisterZNode( monZnode.c_str(), monData.c_str(), ZOO_EPHEMERAL );
+        rc = ZNodeCreate( monZnode.c_str(), monData.c_str(), ZOO_EPHEMERAL );
     }
     
     if (rc != ZOK)
     {
-        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
-        {
-            trace_printf( "%s@%d Error (MasterMonitor) Create master node for %s with rc = %d)\n"
-                    , method_name, __LINE__, monZnode.c_str( ), rc);
-        }
         char buf[MON_STRING_BUF_SIZE];
         snprintf( buf, sizeof(buf)
-                , "[%s], RegisterZNode(%s) failed with error %s\n"
+                , "[%s], ZNodeCreate(%s) failed with error %s\n"
                 , method_name, monData.c_str(), zerror(rc) );
-        mon_log_write(MON_ZCLIENT_CREATEMASTERZNODE, SQ_LOG_ERR, buf);
+        mon_log_write(MON_ZCLIENT_MASTERZNODECREATE_1, SQ_LOG_ERR, buf);
 
         TRACE_EXIT;
         return(rc); // Return the error
@@ -1031,157 +2191,54 @@
     return(rc);
 }
 
-int CZClient::MakeClusterZNodes( void )
+int CZClient::MasterZNodeDelete( const char *nodeName )
 {
-    const char method_name[] = "CZClient::MakeClusterZNodes";
+    const char method_name[] = "CZClient::MasterZNodeDelete";
     TRACE_ENTRY;
+    // Delete master znode <masterZNodePath_>/<nodeName>; returns the ZNodeDelete() rc
+    int rc = -1;
+    stringstream newpath;
+    newpath.str( "" );
+    newpath << masterZNodePath_.c_str() << "/"
+            << nodeName;
+           
+    string znode = newpath.str( );
 
-    int rc;
-    Stat stat;
-
-    stringstream ss;
-    ss.str( "" );
-    ss << zkRootNode_.c_str();
-    string rootDir( ss.str( ) );
-
-    rc = ZooExistRetry( ZHandle, rootDir.c_str(), 0, &stat );
-    switch (rc)
+    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
     {
-    case ZOK:
-        break;
-    case ZNONODE:
-        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
-        {
-            trace_printf( "%s@%d RegisterZNode(%s)\n"
-                        , method_name, __LINE__ 
-                        , rootDir.c_str() );
-        }
-        rc = RegisterZNode( rootDir.c_str(), NULL, 0 );
-        if ( rc && rc != ZNODEEXISTS )
-        {
-            return(rc); // Return the error
-        }
-        rc = ZOK;
-        break;
-    default:
-        char buf[MON_STRING_BUF_SIZE];
-        snprintf( buf, sizeof(buf)
-                , "[%s], zoo_exists(%s) failed with error %s\n"
-                , method_name, rootDir.c_str(), zerror(rc) );
-        mon_log_write(MON_ZCLIENT_CHECKCLUSTERZNODES_1, SQ_LOG_ERR, buf);
-        if (rc) return(rc); // Return the error
-        break;
+        trace_printf( "%s@%d Deleting znode(%s)\n"
+                    , method_name, __LINE__
+                    , znode.c_str() );
     }
-
-    ss.str( "" );
-    ss << zkRootNode_.c_str() 
-       << zkRootNodeInstance_.c_str();
-    string instanceDir( ss.str( ) );
-
-    rc = ZooExistRetry( ZHandle, instanceDir.c_str( ), 0, &stat );
-    switch (rc)
+   
+    rc = ZNodeDelete( znode );
+    if ( rc == ZOK )
     {
-    case ZOK:
-        break;
-    case ZNONODE:
-        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
-        {
-            trace_printf( "%s@%d RegisterZNode(%s)\n"
-                        , method_name, __LINE__
-                        , instanceDir.c_str() );
-        }
-        rc = RegisterZNode( instanceDir.c_str(), NULL, 0 );
-        if ( rc && rc != ZNODEEXISTS )
-        {
-            return(rc); // Return the error
-        }
-        rc = ZOK;
-        break;
-    default:
         char buf[MON_STRING_BUF_SIZE];
         snprintf( buf, sizeof(buf)
-                , "[%s], zoo_exists(%s) failed with error %s\n"
-                , method_name, instanceDir.c_str( ), zerror(rc) );
-        mon_log_write(MON_ZCLIENT_CHECKCLUSTERZNODES_2, SQ_LOG_ERR, buf);
-        break;
+                , "[%s], Master znode (%s) deleted!\n"
+                , method_name, nodeName );
+        mon_log_write(MON_ZCLIENT_MASTERZNODEDELETE_1, SQ_LOG_INFO, buf);
     }
-
-    ss.str( "" );
-    ss << zkRootNode_.c_str() 
-       << zkRootNodeInstance_.c_str() 
-       << ZCLIENT_CLUSTER_ZNODE;
-    string clusterDir( ss.str( ) );
-
-    rc = ZooExistRetry( ZHandle, clusterDir.c_str( ), 0, &stat );
-    switch (rc)
+    else if ( rc == ZNONODE )
     {
-    case ZOK:
-        break;
-    case ZNONODE:
+        // This is ok since we call it indiscriminately; ZNONODE is still returned to the caller
         if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
         {
-            trace_printf( "%s@%d RegisterZNode(%s)\n"
+            trace_printf( "%s@%d (MasterMonitor) Master ZNode %s already deleted\n"
                         , method_name, __LINE__
-                        , clusterDir.c_str() );
+                        , nodeName );
         }
-        rc = RegisterZNode( clusterDir.c_str(), NULL, 0 );
-        if ( rc && rc != ZNODEEXISTS )
-        {
-            return(rc); // Return the error
-        }
-        rc = ZOK;
-        break;
-    default:
-        char buf[MON_STRING_BUF_SIZE];
-        snprintf( buf, sizeof(buf)
-                , "[%s], zoo_exists(%s) failed with error %s\n"
-                , method_name, clusterDir.c_str(), zerror(rc) );
-        mon_log_write(MON_ZCLIENT_CHECKCLUSTERZNODES_3, SQ_LOG_ERR, buf);
-        break;
-    }
-
-    ss.str( "" );
-    ss << zkRootNode_.c_str() 
-       << zkRootNodeInstance_.c_str() 
-       << ZCLIENT_MASTER_ZNODE;
-    string masterDir( ss.str( ) );
-
-    rc = ZooExistRetry( ZHandle, masterDir.c_str( ), 0, &stat );
-    switch (rc)
-    {
-    case ZOK:
-        break;
-    case ZNONODE:
-        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
-        {
-            trace_printf( "%s@%d RegisterZNode(%s)\n"
-                        , method_name, __LINE__
-                        , masterDir.c_str() );
-        }
-        rc = RegisterZNode( masterDir.c_str(), NULL, 0 );
-        if ( rc && rc != ZNODEEXISTS )
-        {
-            return(rc); // Return the error
-        }
-        rc = ZOK;
-        break;
-    default:
-        char buf[MON_STRING_BUF_SIZE];
-        snprintf( buf, sizeof(buf)
-                , "[%s], zoo_exists(%s) failed with error %s\n"
-                , method_name, masterDir.c_str(), zerror(rc) );
-        mon_log_write(MON_ZCLIENT_CHECKCLUSTERZNODES_3, SQ_LOG_ERR, buf);
-        break;
     }
     
     TRACE_EXIT;
-    return(rc);
+    return( rc ); // any other ZNodeDelete() rc is returned without being logged in this function
 }
 
 // ZClient main processing loop
-void CZClient::MonitorZCluster()
+void CZClient::MonitorCluster()
 {
-    const char method_name[] = "CZClient::MonitorZCluster";
+    const char method_name[] = "CZClient::MonitorCluster";
     TRACE_ENTRY;
 
     int rc;
@@ -1195,10 +2252,10 @@
 
     if (zcMonitoringRate_ >= 0)
     {
-        SetTimeToWakeUp( timeout );
+        TimeToWakeUpSet( timeout );
     }
 
-    while ( GetState() != ZC_SHUTDOWN )
+    while ( StateGet() != ZC_SHUTDOWN )
     {
         lock();
         if ( !IsEnabled() )
@@ -1208,7 +2265,8 @@
         }
         else
         {
-            if (zcMonitoringRate_ < 0 || GetState() == ZC_DISABLED)
+
+            if (zcMonitoringRate_ < 0 || StateGet() == ZC_DISABLED)
             {
                 // Wait until signaled
                 CLock::wait();
@@ -1216,9 +2274,54 @@
                 {
                     trace_printf( "%s@%d" " - ZCluster signaled, state_=%s\n"
                                 , method_name, __LINE__
-                                , ZClientStateStr(GetState()) );
+                                , ZClientStateStr(StateGet()) );
                 }
             }
+
+            if (znodeDeletedQueue_.size() != 0)
+            {
+                if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+                {
+                    trace_printf( "%s@%d - ZCluster signaling: "
+                                  "ZC_ZNODE_DELETED, znodeDeletedQueue_=%ld\n"
+                                , method_name, __LINE__
+                                , znodeDeletedQueue_.size() );
+                }
+                StateSet( ZC_ZNODE_DELETED );
+            }
+            else if (znodeChildQueue_.size() != 0)
+            {
+                if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+                {
+                    trace_printf( "%s@%d - ZCluster signaling: "
+                                  "ZC_ZNODE_CHILD, znodeChildQueue_=%ld\n"
+                                , method_name, __LINE__
+                                , znodeChildQueue_.size() );
+                }
+                StateSet( ZC_ZNODE_CHILD );
+            }
+            else if (znodeCreatedQueue_.size() != 0)
+            {
+                if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+                {
+                    trace_printf( "%s@%d - ZCluster signaling: "
+                                  "ZC_ZNODE_CREATED, znodeCreatedQueue_=%ld\n"
+                                , method_name, __LINE__
+                                , znodeCreatedQueue_.size() );
+                }
+                StateSet( ZC_ZNODE_CREATED );
+            }
+            else if (znodeChangedQueue_.size() != 0)
+            {
+                if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+                {
+                    trace_printf( "%s@%d - ZCluster signaling: "
+                                  "ZC_ZNODE_CHANGED, znodeChangedQueue_=%ld\n"
+                                , method_name, __LINE__
+                                , znodeChangedQueue_.size() );
+                }
+                StateSet( ZC_ZNODE_CHANGED );
+            }
             else
             {
                 // Wait until signaled or timer expires
@@ -1227,7 +2330,7 @@
                 {
                     if ( rc != 0 )
                     {
-                        StopClusterMonitoring();
+                        ClusterMonitoringStop();
                     }
                     else
                     {
@@ -1235,65 +2338,85 @@
                         {
                             trace_printf( "%s@%d" " - ZCluster signaled, state_=%s\n"
                                         , method_name, __LINE__
-                                        , ZClientStateStr(GetState()) );
+                                        , ZClientStateStr(StateGet()) );
                         }
                     }
                 }
             }
         }
 
-        switch ( GetState() )
+        switch ( StateGet() )
         {
             case ZC_START:
-                StartClusterMonitoring();
+                ClusterMonitoringStart();
                 break;
             case ZC_CLUSTER:
-                if ( IsCheckCluster() )
+                if ( IsClusterWatchEnabled() )
                 {
-                    CheckCluster();
-                    if (GetState() != ZC_STOP)
+                    RunningZNodesCheck();
+                    if (StateGet() != ZC_STOP)
                     {
-                        SetState( ZC_MYZNODE );
+                        StateSet( ZC_MYZNODE );
                     }
                 }
                 break;
             case ZC_WATCH:
-                if ( !IsCheckCluster() )
+                if ( !IsClusterWatchEnabled() )
                 {
-                    WatchCluster();
-                    if (GetState() != ZC_STOP)
+                    ConfiguredZNodesWatchSet();
+                    ErrorZNodesWatchSet();
+                    RunningZNodesWatchSet();
+                    if (StateGet() != ZC_STOP)
                     {
-                        SetState( ZC_MYZNODE );
+                        ClusterWatchEnabledSet( true );
+                        StateSet( ZC_MYZNODE );
                     }
                 }
                 break;
             case ZC_MYZNODE:
-                if ( IsCheckCluster() )
+                if ( IsClusterWatchEnabled() )
                 {
-                    CheckMyZNode();
+                    MyRunningZNodeCheck();
                 }
                 break;
-            case ZC_ZNODE:
-                if ( IsCheckCluster() )
+            case ZC_ZNODE_CHANGED:
+                if ( IsClusterWatchEnabled() )
                 {
-                    HandleExpiredZNode();
-                    SetState( ZC_MYZNODE );
+                    HandleChangedZNode();
+                    StateSet( ZC_MYZNODE );
                 }
-                // we still need to check if the master went down
-                else
+                break;
+            case ZC_ZNODE_CHILD:
+                if ( IsClusterWatchEnabled() )
                 {
-                    HandleMasterZNode(); 
+                    HandleChildZNode();
+                    StateSet( ZC_MYZNODE );
+                }
+                break;
+            case ZC_ZNODE_CREATED:
+                if ( IsClusterWatchEnabled() )
+                {
+                    HandleCreatedZNode();
+                    StateSet( ZC_MYZNODE );
+                }
+                break;
+            case ZC_ZNODE_DELETED:
+                if ( IsClusterWatchEnabled() )
+                {
+                    HandleDeletedZNode();
+                    StateSet( ZC_MYZNODE );
                 }
                 break;
             case ZC_STOP:
-                StopClusterMonitoring();
+                ClusterMonitoringStop();
                 break;
             default:
                 break;
         }
+
         if (zcMonitoringRate_ >= 0)
         {
-            SetTimeToWakeUp( timeout );
+            TimeToWakeUpSet( timeout );
         }
         unlock();
     }
@@ -1307,9 +2430,74 @@
     TRACE_EXIT;
 }
 
-int CZClient::RegisterMyNodeZNode( void )
+void CZClient::MyRunningZNodeCheck( void )
 {
-    const char method_name[] = "CZClient::RegisterMyNodeZNode";
+    // Check that this node's running znode (Node_name) is still present.
+    // The node is taken down via HandleMyNodeExpiration() when the znode
+    // has expired, or when quorum communication errors persist past the
+    // failure deadline. No-op unless the cluster watch is enabled.
+    const char method_name[] = "CZClient::MyRunningZNodeCheck";
+    TRACE_ENTRY;
+
+    int zerr;
+    struct timespec currentTime;
+
+    if ( IsClusterWatchEnabled() )
+    {
+        if (resetMyZNodeFailedTime_)
+        {
+            // Re-arm the failure deadline: now + twice the session timeout
+            resetMyZNodeFailedTime_ = false;
+            clock_gettime(CLOCK_REALTIME, &myZNodeFailedTime_);
+            myZNodeFailedTime_.tv_sec += (SessionTimeoutGet() * 2);
+        }
+        if (MyNode->IsPendingNodeDown())
+        {
+            // A node down is already pending; leave through TRACE_EXIT too
+            TRACE_EXIT;
+            return;
+        }
+        if ( ! IsRunningZNodeExpired( Node_name, zerr ) )
+        {
+            if ( zerr == ZCONNECTIONLOSS || zerr == ZOPERATIONTIMEOUT )
+            {
+                // Ignore transient errors with the quorum.
+                // However, once they outlast the deadline armed above
+                // (twice the session timeout), handle it as a hard error.
+                clock_gettime(CLOCK_REALTIME, &currentTime);
+                if (currentTime.tv_sec > myZNodeFailedTime_.tv_sec)
+                {
+                    char buf[MON_STRING_BUF_SIZE];
+                    snprintf( buf, sizeof(buf)
+                            , "[%s], Zookeeper quorum comm error: %s - Handling my znode (%s) as expired! Node is going down.\n"
+                            , method_name, zerror(zerr), Node_name );
+                    mon_log_write(MON_ZCLIENT_MYRUNNINGZNODECHECK_1, SQ_LOG_ERR, buf);
+                    HandleMyNodeExpiration();
+                }
+            }
+            else
+            {
+                // No transient quorum error: re-arm the deadline next time
+                resetMyZNodeFailedTime_ = true;
+            }
+        }
+        else
+        {
+            char buf[MON_STRING_BUF_SIZE];
+            snprintf( buf, sizeof(buf)
+                    , "[%s], My znode (%s) expired! Node is going down.\n"
+                    , method_name, Node_name );
+            mon_log_write(MON_ZCLIENT_MYRUNNINGZNODECHECK_2, SQ_LOG_ERR, buf);
+            HandleMyNodeExpiration();
+        }
+    }
+    
+    TRACE_EXIT;
+}
+
+int CZClient::MyRunningZNodeCreate( void )
+{
+    const char method_name[] = "CZClient::MyRunningZNodeCreate";
     TRACE_ENTRY;
 
     int rc;
@@ -1319,9 +2507,7 @@
 
     stringstream newpath;
     newpath.str( "" );
-    newpath << zkRootNode_.c_str() 
-            << zkRootNodeInstance_.c_str() 
-            << ZCLIENT_CLUSTER_ZNODE << "/"
+    newpath << runningZNodePath_.c_str() << "/"
             << Node_name;
     string monZnode = newpath.str( );
 
@@ -1332,24 +2518,676 @@
 
     if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
     {
-        trace_printf( "%s@%d RegisterZNode(%s:%s)\n"
+        trace_printf( "%s@%d ZNodeCreate(%s:%s)\n"
                     , method_name, __LINE__
                     , monZnode.c_str()
                     , monData.c_str() );
     }
 
-    rc = RegisterZNode( monZnode.c_str(), monData.c_str(), ZOO_EPHEMERAL );
+    lock();
+    // Clean up previous error znodes
+    HandleErrorChildZNodes( Node_name );
+    unlock();
+
+    rc = ZNodeCreate( monZnode.c_str(), monData.c_str(), ZOO_EPHEMERAL );
 
     TRACE_EXIT;
-
     return(rc);
 }
 
-int CZClient::RegisterZNode( const char *znodePath
-                           , const char *znodeData
-                           , int flags )
+int CZClient::RunningZNodeDelete( const char *nodeName )
 {
-    const char method_name[] = "CZClient::RegisterZNode";
+    const char method_name[] = "CZClient::RunningZNodeDelete";
+    TRACE_ENTRY;
+    // Delete running znode <runningZNodePath_>/<nodeName>; returns the ZNodeDelete() rc
+    int rc = -1;
+
+    stringstream newpath;
+    newpath.str( "" );
+    newpath << runningZNodePath_.c_str() << "/"
+            << nodeName;
+    string monZnode = newpath.str( );
+
+    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+    {
+        trace_printf( "%s@%d Deleting znode(%s)\n"
+                    , method_name, __LINE__
+                    , monZnode.c_str() );
+    }
+
+    if (strcmp( Node_name, nodeName) == 0) // extra cleanup only when nodeName equals Node_name
+    {
+        // Clean up my error znode and children
+        HandleErrorChildZNodes( Node_name ); // NOTE(review): no lock() here, but MyRunningZNodeCreate() calls it under lock() - confirm
+        // Clean up error znodes and where I am their 'only' child
+        lock();
+        HandleErrorChildZNodesForZNodeChild( Node_name, true );
+        unlock();
+    }
+
+    rc = ZNodeDelete( monZnode );
+    if ( rc == ZOK )
+    {
+        char buf[MON_STRING_BUF_SIZE];
+        snprintf( buf, sizeof(buf)
+                , "[%s], znode (%s) deleted!\n"
+                , method_name, nodeName );
+        mon_log_write(MON_ZCLIENT_RUNZNODEWATCHDELETE_1, SQ_LOG_INFO, buf); // NOTE(review): same event id as RunningZNodeWatchDelete() - confirm intended
+    }
+    // Any non-ZOK rc is handed back to the caller without being logged here
+    TRACE_EXIT;
+    return( rc );
+}
+
+int CZClient::RunningZNodeWatchAdd( const char *nodeName )
+{
+    const char method_name[] = "CZClient::RunningZNodeWatchAdd";
+    TRACE_ENTRY;
+
+    // Full path of the running znode to watch: <runningZNodePath_>/<nodeName>
+    string watchPath( runningZNodePath_.c_str() );
+    watchPath += "/";
+    watchPath += nodeName;
+
+    // ZNodeWatchSet() is called between lock() and unlock()
+    lock();
+    const int zrc = ZNodeWatchSet( watchPath );
+    unlock();
+
+    if ( zrc == ZOK )
+    {
+        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+        {
+            trace_printf( "%s@%d Watch set on monZnode=%s\n"
+                        , method_name, __LINE__
+                        , watchPath.c_str() );
+        }
+    }
+    else
+    {
+        // Failure is only logged; the rc is handed back to the caller
+        char msg[MON_STRING_BUF_SIZE];
+        snprintf( msg, sizeof(msg)
+                , "[%s], ZNodeWatchSet(%s) failed!\n"
+                , method_name, watchPath.c_str() );
+        mon_log_write(MON_ZCLIENT_RUNZNODEWATCHADD_1, SQ_LOG_ERR, msg);
+    }
+
+    TRACE_EXIT;
+    return( zrc );
+}
+
+int CZClient::RunningZNodeWatchDelete( const char *nodeName )
+{
+    // Reset the watch on running znode <runningZNodePath_>/<nodeName>.
+    // Only ZNodeWatchReset() is called here; its rc is returned as-is.
+    const char method_name[] = "CZClient::RunningZNodeWatchDelete";
+    TRACE_ENTRY;
+
+    stringstream newpath;
+    newpath.str( "" );
+    newpath << runningZNodePath_.c_str() << "/"
+            << nodeName;
+    string monZnode = newpath.str( );
+
+    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+    {
+        trace_printf( "%s@%d Resetting watch on znode(%s)\n"
+                    , method_name, __LINE__
+                    , monZnode.c_str() );
+    }
+    int rc = ZNodeWatchReset( monZnode );
+    if ( rc == ZOK )
+    {
+        char buf[MON_STRING_BUF_SIZE];
+        snprintf( buf, sizeof(buf)
+                , "[%s], watch on znode (%s) deleted!\n"
+                , method_name, nodeName );
+        mon_log_write(MON_ZCLIENT_RUNZNODEWATCHDELETE_1, SQ_LOG_INFO, buf);
+    }
+
+    TRACE_EXIT;
+    return( rc );
+}
+
+void CZClient::RunningZNodesCheck( void )
+{
+    const char method_name[] = "CZClient::RunningZNodesCheck";
+    TRACE_ENTRY;
+    // Read the data of every running znode; request ZC_STOP if the list cannot be fetched
+    int rc;
+    struct String_vector nodes;
+
+    if ( IsClusterWatchEnabled() )
+    {
+        rc = RunningZNodesGet( &nodes );
+        if ( rc != ZOK )
+        {
+            char buf[MON_STRING_BUF_SIZE];
+            snprintf( buf, sizeof(buf)
+                    , "[%s], RunningZNodesGet() failed!\n"
+                    , method_name );
+            mon_log_write(MON_ZCLIENT_RUNZNODESCHECK_1, SQ_LOG_ERR, buf);
+            StateSet( CZClient::ZC_STOP );
+            CLock::wakeOne();
+            TRACE_EXIT;
+            return;
+        }
+
+        stringstream newpath;
+        string nodeName;
+        int    pnid = -1;
+    
+        if ( nodes.count > 0 )
+        {
+            for (int i = 0; i < nodes.count ; i++ )
+            {
+                newpath.str( "" );
+                newpath << runningZNodePath_.c_str() << "/"
+                        << nodes.data[i];
+                string monZnode = newpath.str( );
+                // Only the outcome is logged or traced; nodeName/pnid are not used further
+                rc = ZNodeDataGet( monZnode, nodeName, pnid );
+                if ( rc != ZOK )
+                {
+                    char buf[MON_STRING_BUF_SIZE];
+                    snprintf( buf, sizeof(buf)
+                            , "[%s], ZNodeDataGet(%s) failed!\n"
+                            , method_name
+                            , monZnode.c_str() );
+                    mon_log_write(MON_ZCLIENT_RUNZNODESCHECK_2, SQ_LOG_ERR, buf);
+                }
+                else
+                {
+                    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+                    {
+                        trace_printf( "%s@%d monZnode=%s, nodeName=%s, pnid=%d)\n"
+                                    , method_name, __LINE__
+                                    , monZnode.c_str(), nodeName.c_str(), pnid );
+                    }
+                }
+            }
+            FreeStringVector( &nodes );
+        }
+    }
+    else
+    {
+        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+        {
+            trace_printf( "%s@%d ClusterWatch is NOT set!\n"
+                        , method_name, __LINE__ );
+        }
+    }
+    
+    TRACE_EXIT;
+}
+
+void CZClient::RunningZNodesDelete( void )
+{
+    const char method_name[] = "CZClient::RunningZNodesDelete";
+    TRACE_ENTRY;
+    // Delete the running and master znodes of every node found under runningZNodePath_
+    int rc;
+    struct String_vector nodes;
+
+    rc = RunningZNodesGet( &nodes );
+    if ( rc != ZOK )
+    {
+        char buf[MON_STRING_BUF_SIZE];
+        snprintf( buf, sizeof(buf)
+                , "[%s], RunningZNodesGet() failed!\n"
+                , method_name );
+        mon_log_write(MON_ZCLIENT_RUNZNODESDELETE_1, SQ_LOG_ERR, buf);
+        CLock::wakeOne();
+        TRACE_EXIT;
+        return;
+    }
+
+    stringstream newpath;
+    string nodeName;
+    int    pnid = -1;
+
+    if ( nodes.count > 0 )
+    {
+        for (int i = 0; i < nodes.count ; i++ )
+        {
+            newpath.str( "" );
+            newpath << runningZNodePath_.c_str() << "/"
+                    << nodes.data[i];
+            string monZnode = newpath.str( );
+            // The node name to delete comes from the znode's data, not from its path
+            rc = ZNodeDataGet( monZnode, nodeName, pnid );
+            if ( rc != ZOK )
+            {
+                char buf[MON_STRING_BUF_SIZE];
+                snprintf( buf, sizeof(buf)
+                        , "[%s], ZNodeDataGet(%s) failed!\n"
+                        , method_name
+                        , monZnode.c_str() );
+                mon_log_write(MON_ZCLIENT_RUNZNODESDELETE_2, SQ_LOG_ERR, buf);
+            }
+            else
+            {
+                if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+                {
+                    trace_printf( "%s@%d monZnode=%s, nodeName=%s, pnid=%d)\n"
+                                , method_name, __LINE__
+                                , monZnode.c_str(), nodeName.c_str(), pnid );
+                }
+                ZClient->RunningZNodeDelete( nodeName.c_str() );
+                ZClient->MasterZNodeDelete( nodeName.c_str() );
+            }
+        }
+        FreeStringVector( &nodes );
+    }
+
+    TRACE_EXIT;
+}
+
+int CZClient::RunningZNodesGet( String_vector *nodes )
+{
+    const char method_name[] = "CZClient::RunningZNodesGet";
+    TRACE_ENTRY;
+    // Fetch the children of runningZNodePath_ into *nodes; returns the last ZooKeeper rc seen
+    bool found = false;
+    int rc = -1;
+    int retries = 0;
+    Stat stat;
+
+    string trafCluster( runningZNodePath_.c_str() );
+
+    nodes->count = 0; // start empty: the loop below and the callers test nodes->count
+    nodes->data = NULL;
+
+    while ( !found )
+    {
+        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+        {
+            trace_printf( "%s@%d trafCluster=%s\n"
+                        , method_name, __LINE__, trafCluster.c_str() );
+        }
+        // Verify the existence of the parent
+        rc = ZooExistRetry( ZHandle, trafCluster.c_str( ), 0, &stat );
+        if ( rc == ZNONODE )
+        {
+            if (retries > ZOOKEEPER_RETRY_COUNT) // NOTE(review): this path retries with no sleep in this function - confirm intended
+                break;
+            retries++;    
+            continue;
+        }
+        else if ( rc == ZOK )
+        {
+            // Now get the list of available znodes in the cluster.
+            // Success is judged by nodes->count below, not by the returned rc.
+            // This will return child znodes for each monitor process that has
+            // registered, including this process.
+            rc = zoo_get_children( ZHandle, trafCluster.c_str( ), 0, nodes );
+            if ( nodes->count > 0 )
+            {
+                if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+                {
+                    trace_printf( "%s@%d nodes.count=%d\n"
+                                , method_name, __LINE__
+                                , nodes->count );
+                }
+                found = true;
+            }
+            else
+            {
+                sleep(ZOOKEEPER_RETRY_WAIT);
+                if (retries > ZOOKEEPER_CHILD_RETRY_COUNT) // shares the 'retries' counter with the ZNONODE path above
+                    break;
+                retries++;    
+                continue;
+            }
+        }
+        else  // error
+        {
+            char buf[MON_STRING_BUF_SIZE];
+            snprintf( buf, sizeof(buf)
+                    , "[%s], zoo_exists() for %s failed with error %s\n"
+                    ,  method_name, trafCluster.c_str( ), zerror(rc));
+            mon_log_write(MON_ZCLIENT_RUNZNODESGET_1, SQ_LOG_ERR, buf);
+            break;
+        }
+    }
+
+    TRACE_EXIT;
+    return( rc );
+}
+
+void CZClient::RunningZNodesWatchSet( void )
+{
+    const char method_name[] = "CZClient::RunningZNodesWatchSet";
+    TRACE_ENTRY;
+    // Set a watch on every running znode; skipped when the cluster watch is already enabled
+    int rc;
+    struct String_vector nodes;
+
+    if ( !IsClusterWatchEnabled() )
+    {
+        rc = RunningZNodesGet( &nodes );
+        if ( rc != ZOK )
+        {
+            char buf[MON_STRING_BUF_SIZE];
+            snprintf( buf, sizeof(buf)
+                    , "[%s], RunningZNodesGet() failed!\n"
+                    , method_name );
+            mon_log_write(MON_ZCLIENT_RUNZNODESWATCHSET_1, SQ_LOG_ERR, buf);
+            CLock::wakeOne();
+            TRACE_EXIT;
+            return;
+        }
+
+        stringstream runningpath;
+    
+        if ( nodes.count > 0 )
+        {
+            for (int i = 0; i < nodes.count ; i++ )
+            {
+                runningpath.str( "" );
+                runningpath << runningZNodePath_.c_str() << "/"
+                            << nodes.data[i];
+                string runningznode = runningpath.str( );
+                // Give up at the first znode whose watch cannot be set
+                rc = ZNodeWatchSet( runningznode );
+                if ( rc != ZOK )
+                {
+                    char buf[MON_STRING_BUF_SIZE];
+                    snprintf( buf, sizeof(buf)
+                            , "[%s], ZNodeWatchSet(%s) failed!\n"
+                            , method_name
+                            , runningznode.c_str() );
+                    mon_log_write(MON_ZCLIENT_RUNZNODESWATCHSET_2, SQ_LOG_ERR, buf);
+
+                    FreeStringVector( &nodes );
+                    TRACE_EXIT;
+                    return;
+                }
+                else
+                {
+                    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+                    {
+                        trace_printf( "%s@%d Watch set on monZnode=%s\n"
+                                    , method_name, __LINE__
+                                    , runningznode.c_str() );
+                    }
+                }
+            }
+            FreeStringVector( &nodes );
+        }
+    }
+    else
+    {
+        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+        {
+            trace_printf( "%s@%d Cluster watch already enabled!\n"
+                        , method_name, __LINE__ );
+        }
+    }
+    
+    TRACE_EXIT;
+}
+
+int CZClient::ShutdownWork(void)
+{
+    const char method_name[] = "CZClient::ShutdownWork";
+    TRACE_ENTRY;
+    // Stop the ZClient thread: returns 0 or the pthread_join() error code.
+    // Request ZC_SHUTDOWN and wake the thread so it observes the new state
+    StateSet( ZC_SHUTDOWN );
+    CLock::wakeOne();
+
+    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+    {
+        trace_printf( "%s@%d waiting for ZClient thread %lx to exit.\n"
+                    ,  method_name, __LINE__, threadId_);
+    }
+
+    // Wait for the ZClient thread (threadId_) to exit
+    int rc = pthread_join( threadId_, NULL );
+    if (rc != 0)
+    {
+        char buf[MON_STRING_BUF_SIZE];
+        snprintf( buf, sizeof(buf), "[%s], Error= Can't join thread! - errno=%d (%s)\n"
+                , method_name, rc, strerror(rc) );
+        mon_log_write(MON_ZCLIENT_SHUTDOWNWORK_1, SQ_LOG_ERR, buf);
+    }
+
+    TRACE_EXIT;
+    return(rc);
+}
+
+// Spawn the ZClient thread: runs ZClientThread with this object as argument
+int CZClient::StartWork( void )
+{
+    const char method_name[] = "CZClient::StartWork";
+    TRACE_ENTRY;
+
+    const int createErr = pthread_create( &threadId_, NULL, ZClientThread, this );
+    if (createErr)
+    {
+        // Thread creation failed: log the pthread error code
+        char msg[MON_STRING_BUF_SIZE];
+        snprintf( msg, sizeof(msg), "[%s], ZClientThread create error=%d\n"
+                , method_name, createErr );
+        mon_log_write( MON_ZCLIENT_STARTWORK_1, SQ_LOG_ERR, msg );
+    }
+    TRACE_EXIT;
+    return( createErr );
+}
+
+void CZClient::StartMonitoring( void )
+{
+    const char method_name[] = "CZClient::StartMonitoring";
+    TRACE_ENTRY;
+    if (ZHandle) // nothing is done when ZHandle is not set
+    {
+        ZClient->StateSet( CZClient::ZC_START ); // acts on the global ZClient object, not on 'this'
+        ZClient->CLock::wakeOne(); // presumably wakes the thread blocked in CLock::wait() in MonitorCluster() - confirm
+    }
+    TRACE_EXIT;
+}
+
+void CZClient::StateSet( ZClientState_t state )
+{
+    // Hold the ZClient locker for the whole update
+    CAutoLock lock(getLocker());
+
+    // Once StateGet() reports ZC_SHUTDOWN no further change is applied
+    if ( StateGet() == ZC_SHUTDOWN )
+    {
+        return;
+    }
+    if (state == ZC_SHUTDOWN) shutdown_ = true;
+    state_ = state;
+}
+
+// Apply `state`, then append `znodePath` to the queue that matches the
+// ZooKeeper event `type` (changed, child, created or deleted).  Nothing happens
+// once the state is ZC_SHUTDOWN; any other `type` aborts the process.
+void CZClient::StateSet( int type, ZClientState_t state, const char *znodePath ) 
+{
+    const char method_name[] = "CZClient::StateSet";
+
+    CAutoLock lock(getLocker());
+    if ( StateGet() == ZC_SHUTDOWN )
+    {
+        return;
+    }
+    // The state is stored before `type` is validated.  NOTE(review): the call
+    // below takes getLocker() again, so that lock must be re-entrant - confirm.
+    StateSet( state );
+    const bool tracing = ( trace_settings & (TRACE_INIT | TRACE_RECOVERY) ) != 0;
+
+    if ( type == ZOO_CHANGED_EVENT )
+    {
+        znodeChangedQueue_.push_back( znodePath );
+        if ( tracing )
+        {
+            trace_printf( "%s@%d - state_=%s, znodeChangedQueue_=%ld\n"
+                        , method_name, __LINE__, ZClientStateStr(StateGet())
+                        , znodeChangedQueue_.size() );
+        }
+    }
+    else if ( type == ZOO_CHILD_EVENT )
+    {
+        znodeChildQueue_.push_back( znodePath );
+        if ( tracing )
+        {
+            trace_printf( "%s@%d - state_=%s, znodeChildQueue_=%ld\n"
+                        , method_name, __LINE__, ZClientStateStr(StateGet())
+                        , znodeChildQueue_.size() );
+        }
+    }
+    else if ( type == ZOO_CREATED_EVENT )
+    {
+        znodeCreatedQueue_.push_back( znodePath );
+        if ( tracing )
+        {
+            trace_printf( "%s@%d - state_=%s, znodeCreatedQueue_=%ld\n"
+                        , method_name, __LINE__, ZClientStateStr(StateGet())
+                        , znodeCreatedQueue_.size() );
+        }
+    }
+    else if ( type == ZOO_DELETED_EVENT )
+    {
+        znodeDeletedQueue_.push_back( znodePath );
+        if ( tracing )
+        {
+            trace_printf( "%s@%d - state_=%s, znodeDeletedQueue_=%ld\n"
+                        , method_name, __LINE__, ZClientStateStr(StateGet())
+                        , znodeDeletedQueue_.size() );
+        }
+    }
+    else
+    {
+        abort(); // Programmer bonehead!
+    }
+}
+
+void CZClient::StopMonitoring( void ) // requests ZC_STOP through the ZClient pointer
+{
+    const char method_name[] = "CZClient::StopMonitoring";
+    TRACE_ENTRY;
+    ZClient->StateSet( CZClient::ZC_STOP ); // acts on ZClient, not on `this`; no ZHandle check here
+    ZClient->CLock::wakeOne(); // wake-up issued right after the state change
+    TRACE_EXIT;
+}
+
+// Copy the last non-empty '/'-separated component of znodePath into znode
+// (which must have room for it plus the NUL) and return znode.  The first
+// character of znodePath is skipped as the leading '/'; an empty string is
+// stored when no component exists (NULL, "", "/").  The path is scanned in
+// place: no fixed-size scratch copy to overflow, no shared strtok() state.
+char* CZClient::StrCpyLeafZNode( char* znode, const char* znodePath )
+{
+    size_t leafLen = 0;
+    const char *leaf = znodePath;
+    if ( znodePath != NULL && znodePath[0] != '\0' )
+    {
+        const char *first = znodePath + 1;                 // skip the first '/'
+        const char *end = znodePath + strlen( znodePath );
+        while ( end > first && end[-1] == '/' ) --end;     // ignore trailing '/'
+        leaf = end;
+        while ( leaf > first && leaf[-1] != '/' ) --leaf;  // back up to leaf start
+        leafLen = (size_t)( end - leaf );
+    }
+    if ( leafLen != 0 ) memmove( znode, leaf, leafLen );   // znode may alias the path
+    znode[leafLen] = '\0';
+    return( znode );
+}
+
+void CZClient::TimeToWakeUpSet( struct timespec &ts )
+{
+    const char method_name[] = "CZClient::TimeToWakeUpSet";
+    TRACE_ENTRY;
+    // Fill ts with the current CLOCK_REALTIME time plus zcMonitoringRate_ seconds.
+    clock_gettime(CLOCK_REALTIME, &ts); // NOTE(review): return value is not checked
+#if 0 // disabled trace of the clock reading
+    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+    {
+            trace_printf("%s@%d" " - Clock   time %ld(secs):%ld(nsecs)(zcMonitoringRate_=%ld)\n"
+                        , method_name, __LINE__
+                        , ts.tv_sec, ts.tv_nsec, zcMonitoringRate_);
+    }
+#endif
+    // Only tv_sec is advanced; tv_nsec keeps the value read from the clock.
+    ts.tv_sec += zcMonitoringRate_;
+
+#if 0 // disabled trace of the computed wake-up time
+    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+    {
+            trace_printf("%s@%d" " - Timeout time %ld(secs):%ld(nsecs)(zcMonitoringRate_=%ld)\n"
+                        , method_name, __LINE__
+                        , ts.tv_sec, ts.tv_nsec, zcMonitoringRate_);
+    }
+#endif
+    TRACE_EXIT;
+}
+
+// Translate ZooKeeper event `type` on `znodePath` into a client state change
+// and call CLock::wakeOne().  Does nothing once the state is ZC_SHUTDOWN; an
+// unrecognized `type` changes no state but still issues the wake-up.
+void CZClient::TriggerCheck( int type, const char *znodePath )
+{
+    const char method_name[] = "CZClient::TriggerCheck";
+    TRACE_ENTRY;
+
+    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+    {
+        trace_printf( "%s@%d" " - type=%s, path=%s\n"
+                    , method_name, __LINE__
+                    , ZooConnectionTypeStr( type ), znodePath );
+    }
+
+    CAutoLock lock(getLocker());
+    if ( StateGet() != ZC_SHUTDOWN )
+    {
+        if ( type == ZOO_CHANGED_EVENT )
+        {
+            StateSet( type, ZC_ZNODE_CHANGED, znodePath );
+        }
+        else if ( type == ZOO_CHILD_EVENT )
+        {
+            // Child events on the configured or error parent znode reset
+            // that parent's watch before the event is queued.
+            const string eventPath( znodePath );
+            if ( configuredZNodePath_ == eventPath )
+            {
+                ConfiguredZNodesWatchSet();
+            }
+            else if ( errorZNodePath_ == eventPath )
+            {
+                ErrorZNodesWatchSet();
+            }
+            StateSet( type, ZC_ZNODE_CHILD, znodePath );
+        }
+        else if ( type == ZOO_CREATED_EVENT )
+        {
+            StateSet( type, ZC_ZNODE_CREATED, znodePath );
+        }
+        else if ( type == ZOO_DELETED_EVENT )
+        {
+            StateSet( type, ZC_ZNODE_DELETED, znodePath );
+        }
+        else if ( type == ZOO_NOTWATCHING_EVENT )
+        {
+            StateSet( ZC_CLUSTER );
+        }
+
+        // The wake-up is issued for every event seen while not shut down.
+        CLock::wakeOne();
+    }
+    TRACE_EXIT;
+}
+
+int CZClient::ZNodeCreate( const char *znodePath
+                         , const char *znodeData
+                         , int flags
+                         , bool existOk )
+{
+    const char method_name[] = "CZClient::ZNodeCreate";
     TRACE_ENTRY;
 
     int rc = -1;
@@ -1381,13 +3219,17 @@
                    , sizeof(realpath)-1 );
     if ( rc != ZOK )
     {
-        char buf[MON_STRING_BUF_SIZE];
-        snprintf( buf, sizeof(buf)
-                , "[%s], zoo_create(%s) failed with error %s\n"
-                , method_name
-                , zpath.c_str()
-                , zerror(rc) );
-        mon_log_write(MON_ZCLIENT_REGISTERZNODE_1, SQ_LOG_ERR, buf);
+        if ( rc != ZNODEEXISTS || 
+            (rc == ZNODEEXISTS && !existOk) )
+        {
+            char buf[MON_STRING_BUF_SIZE];
+            snprintf( buf, sizeof(buf)
+                    , "[%s], zoo_create(%s) failed with error %s\n"
+                    , method_name
+                    , zpath.c_str()
+                    , zerror(rc) );
+            mon_log_write(MON_ZCLIENT_ZNODECREATE_1, SQ_LOG_ERR, buf);
+        }
     }
     if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
     {
@@ -1398,46 +3240,13 @@
     return( rc );
 }
 
-void CZClient::SetState( ZClientState_t state, const char *znodePath ) 
+int CZClient::ZNodeDataGet( string &monZnode, string &nodeName, int &pnid )
 {
-    CAutoLock lock(getLocker());
-    state_ = state; 
-    znodeQueue_.push_back( znodePath );
-}
-
-void CZClient::SetTimeToWakeUp( struct timespec &ts )
-{
-    const char method_name[] = "CZClient::SetTimeToWakeUp";
+    const char method_name[] = "CZClient::ZNodeDataGet";
     TRACE_ENTRY;
 
-    clock_gettime(CLOCK_REALTIME, &ts);
-#if 0
-    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
-    {
-            trace_printf("%s@%d" " - Clock   time %ld(secs):%ld(nsecs)(zcMonitoringRate_=%ld)\n"
-                        , method_name, __LINE__
-                        , ts.tv_sec, ts.tv_nsec, zcMonitoringRate_);
-    }
-#endif
-
-    ts.tv_sec += zcMonitoringRate_;
-
-#if 0
-    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
-    {
-            trace_printf("%s@%d" " - Timeout time %ld(secs):%ld(nsecs)(zcMonitoringRate_=%ld)\n"
-                        , method_name, __LINE__
-                        , ts.tv_sec, ts.tv_nsec, zcMonitoringRate_);
-    }
-#endif
-    TRACE_EXIT;
-}
-
-int CZClient::SetZNodeWatch( string &monZnode )
-{
-    const char method_name[] = "CZClient::SetZNodeWatch";
-    TRACE_ENTRY;
-
+    char  pnidStr[8] = { 0 };
+    char *tkn = NULL;
     char  zkData[MAX_PROCESSOR_NAME];
     int   rc = -1;
     int   zkDataLen = sizeof(zkData);
@@ -1449,29 +3258,43 @@
                     , method_name, __LINE__, monZnode.c_str() );
     }
     rc = ZooExistRetry( ZHandle, monZnode.c_str( ), 0, &stat );
-    if ( rc == ZNONODE ||
-         rc == ZCONNECTIONLOSS || 
-         rc == ZOPERATIONTIMEOUT )
+    if ( rc == ZNONODE )
     {
         // return the error
         if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
         {
-            trace_printf( "%s@%d monZnode=%s does not exist or "
-                          "cannot be accessed!\n"
+            trace_printf( "%s@%d monZnode=%s does not exist (ZNONODE)\n"
                         , method_name, __LINE__, monZnode.c_str() );
         }
     }
     else if ( rc == ZOK )
     {
         // Get the pnid from the data part of znode
-        rc = zoo_get( ZHandle, monZnode.c_str( ), true, zkData, &zkDataLen, &stat );
-        if ( rc != ZOK )
+        rc = zoo_get( ZHandle, monZnode.c_str( ), false, zkData, &zkDataLen, &stat );
+        if ( rc == ZOK )
+        {
+            // The first token is the node name
+            tkn = strtok( zkData, ":" );
+            if ( tkn != NULL )
+            {
+                nodeName = tkn;
+            }
+            tkn = strtok( NULL, ":" );
+            if ( tkn != NULL )
+            {
+                strcpy( pnidStr, tkn );
+                pnid = atoi( pnidStr );
+            }
+            // TODO: Save monZnode path in corresponding physical node object
+            //       to match with when ZC_NODE is triggered
+        }
+        else
         {
             char buf[MON_STRING_BUF_SIZE];
             snprintf( buf, sizeof(buf)
                     , "[%s], zoo_get() for %s failed with error %s\n"
                     ,  method_name, monZnode.c_str( ), zerror(rc));
-            mon_log_write(MON_ZCLIENT_SETZNODEWATCH_1, SQ_LOG_ERR, buf);
+            mon_log_write(MON_ZCLIENT_ZNODEDATAGET_1, SQ_LOG_ERR, buf);
         }
     }
     else
@@ -1480,475 +3303,68 @@
         snprintf( buf, sizeof(buf)
                 , "[%s], zoo_exists() for %s failed with error %s\n"
                 ,  method_name, monZnode.c_str( ), zerror(rc));
-        mon_log_write(MON_ZCLIENT_SETZNODEWATCH_1, SQ_LOG_CRIT, buf);
-        switch ( rc )
-        {
-        case ZSYSTEMERROR:
-        case ZRUNTIMEINCONSISTENCY:
-        case ZDATAINCONSISTENCY:
-        case ZMARSHALLINGERROR:
-        case ZUNIMPLEMENTED:
-        case ZBADARGUMENTS:
-        case ZINVALIDSTATE:
-        case ZSESSIONEXPIRED:
-        case ZCLOSING:
-            // Treat these error like a session expiration, since
-            // we can't communicate with quorum servers
-            HandleMyNodeExpiration();
-            break;
-        default:
-            break;
-        }
+        mon_log_write(MON_ZCLIENT_ZNODEDATAGET_2, SQ_LOG_ERR, buf);
     }
 
     TRACE_EXIT;
     return( rc );
 }
 
-void CZClient::StartClusterMonitoring( void )
+int CZClient::ZNodeDelete( string &znode )
 {
-    const char method_name[] = "CZClient::StartClusterMonitoring";
-    TRACE_ENTRY;
-
-    if ( !IsEnabled() )
-    {
-        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
-        {
-            trace_printf( "%s@%d Cluster monitoring started!\n\n", method_name, __LINE__ );
-        }
-        SetEnabled( true );
-        SetState( ZC_WATCH );
-        CLock::wakeOne();
-    }
-
-    TRACE_EXIT;
-}
-
-void CZClient::StopClusterMonitoring( void )
-{
-    const char method_name[] = "CZClient::StopClusterMonitoring";
-    TRACE_ENTRY;
-
-    if ( IsEnabled() )
-    {
-        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
-        {
-            trace_printf( "\n%s@%d Cluster monitoring stopped!\n", method_name, __LINE__ );
-        }
-        SetCheckCluster( false );
-        SetEnabled( false );
-        SetState( ZC_DISABLED );
-        CLock::wakeOne();
-    }
-
-    TRACE_EXIT;
-}
-
-int CZClient::ShutdownWork(void)
-{
-    const char method_name[] = "CZClient::ShutdownWork";
-    TRACE_ENTRY;
-
-    // Set flag that tells the commAcceptor thread to exit
-    SetState( ZC_SHUTDOWN );
-    CLock::wakeOne();
-
-    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
-    {
-        trace_printf( "%s@%d waiting for ZClient thread %lx to exit.\n"
-                    ,  method_name, __LINE__, threadId_);
-    }
-
-    // Wait for commAcceptor thread to exit
-    int rc = pthread_join( threadId_, NULL );
-    if (rc != 0)
-    {
-        char buf[MON_STRING_BUF_SIZE];
-        int err = rc;
-        sprintf(buf, "[%s], Error= Can't join thread! - errno=%d (%s)\n", method_name, err, strerror(err));
-        mon_log_write(MON_ZCLIENT_SHUTDOWNWORK_1, SQ_LOG_ERR, buf);
-    }
-
-    TRACE_EXIT;
-    return(rc);
-}
-
-// ZClientThread main
-static void *ZClientThread(void *arg)
-{
-    const char method_name[] = "ZClientThread";
-    TRACE_ENTRY;
-
-    // Parameter passed to the thread is an instance of the CommAccept object
-    CZClient *zooClient = (CZClient *) arg;
-
-    // Mask all allowed signals 
-    sigset_t  mask;
-    sigfillset(&mask);
-    sigdelset(&mask, SIGPROF); // allows profiling such as google profiler
-    int rc = pthread_sigmask(SIG_SETMASK, &mask, NULL);
-    if (rc != 0)
-    {
-        char buf[MON_STRING_BUF_SIZE];
-        snprintf(buf, sizeof(buf), "[%s], pthread_sigmask error=%d\n",
-                 method_name, rc);
-        mon_log_write(MON_ZCLIENT_ZCLIENTTHREAD_1, SQ_LOG_ERR, buf);
-    }
-
-    // Enter thread processing loop
-    zooClient->MonitorZCluster();
-
-    TRACE_EXIT;
-    return NULL;
-}
-
-
-// Create the ZClientThread
-int CZClient::StartWork( void )
-{
-    const char method_name[] = "CZClient::StartWork";
-    TRACE_ENTRY;
-
-    int rc = pthread_create(&threadId_, NULL, ZClientThread, this);
-    if (rc != 0)
-    {
-        char buf[MON_STRING_BUF_SIZE];
-        snprintf(buf, sizeof(buf), "[%s], ZClientThread create error=%d\n",
-                 method_name, rc);
-        mon_log_write(MON_ZCLIENT_STARTWORK_1, SQ_LOG_ERR, buf);
-    }
-
-    TRACE_EXIT;
-    return(rc);
-}
-
-void CZClient::StartMonitoring( void )
-{
-    const char method_name[] = "CZClient::StartMonitoring";
-    TRACE_ENTRY;
-    if (ZHandle)
-    {
-        ZClient->SetState( CZClient::ZC_START );
-        ZClient->CLock::wakeOne();
-    }
-    TRACE_EXIT;
-}
-
-void CZClient::StopMonitoring( void )
-{
-    const char method_name[] = "CZClient::StopMonitoring";
-    TRACE_ENTRY;
-    ZClient->SetState( CZClient::ZC_STOP );
-    ZClient->CLock::wakeOne();
-    TRACE_EXIT;
-}
-
-void CZClient::TriggerCheck( int type, const char *znodePath )
-{
-    const char method_name[] = "CZClient::TriggerCheck";
+    const char method_name[] = "CZClient::ZNodeDelete";
     TRACE_ENTRY;
 
     if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
     {
-        trace_printf( "%s@%d" " - state = %s\n"
+        trace_printf( "%s@%d Deleting znode=%s\n"
                     , method_name, __LINE__
-                    , ZooConnectionTypeStr( type ) );
+                    , znode.c_str() );
     }
 
-    // Leader stuff only relevant in agenMode
-    string masterpath = zkRootNode_ + zkRootNodeInstance_ + ZCLIENT_MASTER_ZNODE;
-    std::string monZnode(znodePath);
-    std::size_t found = monZnode.find(masterpath);
-    // if it is not the master node, then call HandleNodeExpiration
-
-    if (found!=std::string::npos)
-    // zookeeper node, assume stale
-    {
-        char  nodeName[MAX_PROCESSOR_NAME] = { 0 };
-        char  tempName[MAX_PROCESSOR_NAME] = { 0 };
-        char *tkn = NULL;
-        const char *tknStart = znodePath;
-        char *tknLast = NULL;
-        tknStart++; // skip the first '/'
-        strcpy (tempName, tknStart);
-        tkn = strtok( tempName, "/" );
-        strcpy (tempName, tknStart);
-        do
-        {
-            tknLast = tkn;
-            tkn = strtok( NULL, "/" );
-        }
-        while( tkn != NULL );
-        strcpy( nodeName, tknLast );
-        HandleAssignMonitorLeader (nodeName);
-    }
-    else if ( type == ZOO_CREATED_EVENT )
-    {
-        SetState( ZC_ZNODE, znodePath );
-    }
-    else if ( type == ZOO_DELETED_EVENT )
-    {
-        SetState( ZC_ZNODE, znodePath );
-    }
-    else if ( type == ZOO_CHANGED_EVENT )
-    {
-        SetState( ZC_ZNODE, znodePath );
-    }
-    else if ( type == ZOO_CHILD_EVENT )
-    {
-        SetState( ZC_CLUSTER, znodePath );
-    }
-    else if ( type == ZOO_NOTWATCHING_EVENT )
-    {
-        SetState( ZC_CLUSTER );
-    }
-
-    CLock::wakeOne();
-    TRACE_EXIT;
-}
-
-void CZClient::WatchCluster( void )
-{
-    const char method_name[] = "CZClient::WatchCluster";
-    TRACE_ENTRY;
-
-    int rc;
-    struct String_vector nodes;
-
-    if ( !IsCheckCluster() )
-    {
-        rc = GetClusterZNodes( &nodes );
-        if ( rc != ZOK )
-        {
-            char buf[MON_STRING_BUF_SIZE];
-            snprintf( buf, sizeof(buf)
-                    , "[%s], GetClusterZNodes() failed!\n"
-                    , method_name );
-            mon_log_write(MON_ZCLIENT_WATCHCLUSTER_1, SQ_LOG_ERR, buf);
-            SetState( CZClient::ZC_STOP );
-            CLock::wakeOne();
-            return;
-        }
-
-        stringstream newpath;
-        string monZnode;
-    
-        if ( nodes.count > 0 )
-        {
-            for (int i = 0; i < nodes.count ; i++ )
-            {
-                newpath.str( "" );
-                newpath << zkRootNode_.c_str() 
-                        << zkRootNodeInstance_.c_str() 
-                        << ZCLIENT_CLUSTER_ZNODE << "/"
-                        << nodes.data[i];
-                string monZnode = newpath.str( );
-            
-                rc = SetZNodeWatch( monZnode );
-                if ( rc != ZOK )
-                {
-                    char buf[MON_STRING_BUF_SIZE];
-                    snprintf( buf, sizeof(buf)
-                            , "[%s], GetZNodeData(%s) failed!\n"
-                            , monZnode.c_str()
-                            , method_name );
-                    mon_log_write(MON_ZCLIENT_WATCHCLUSTER_2, SQ_LOG_ERR, buf);
-
-                    FreeStringVector( &nodes );
-                    TRACE_EXIT;
-                    return;
-                }
-                else
-                {
-                    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
-                    {
-                        trace_printf( "%s@%d Watch set on monZnode=%s\n"
-                                    , method_name, __LINE__
-                                    , monZnode.c_str() );
-                    }
-                }
-            }
-            SetCheckCluster( true );
-            FreeStringVector( &nodes );
-        }
-    }
-    else
-    {
-        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
-        {
-            trace_printf( "%s@%d CheckCluster is NOT set!\n"
-                        , method_name, __LINE__ );
-        }
-    }
-    
-    TRACE_EXIT;
-}
-
-int CZClient::WatchMasterNode( const char *nodeName )
-{
-    const char method_name[] = "CZClient::WatchMasterNode";
-    TRACE_ENTRY;
-
-    int rc;
-    stringstream newpath;
-    newpath.str( "" );
-    newpath << zkRootNode_.c_str() 
-            << zkRootNodeInstance_.c_str() 
-            << ZCLIENT_MASTER_ZNODE << "/"
-            << nodeName;
-    string monZnode = newpath.str( );
-
-    lock();
-    rc = SetZNodeWatch( monZnode );
-    unlock();
-    if ( rc != ZOK )
-    {
-       if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
-        {
-            trace_printf( "%s@%d Error (MasterMonitor) WatchMasterNode failed with rc = %d for %s\n"
-                        , method_name, __LINE__
-                        , rc
-                        , nodeName);
-        }
-        char buf[MON_STRING_BUF_SIZE];
-        snprintf( buf, sizeof(buf)
-                , "[%s], SetZNodeWatch(%s) failed!\n"
-                , method_name
-                , monZnode.c_str() );
-        mon_log_write(MON_ZCLIENT_WATCHNODE_1, SQ_LOG_ERR, buf); 
-    }
-    else
-    {
-        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
-        {
-            trace_printf( "%s@%d (MasterMonitor) WatchMasterNode set on monZnode=%s\n"
-                        , method_name, __LINE__
-                        , monZnode.c_str() );
-        }
-    }
-
-    TRACE_EXIT;
-    return(rc);
-}
-
-int CZClient::WatchNode( const char *nodeName )
-{
-    const char method_name[] = "CZClient::WatchNode";
-    TRACE_ENTRY;
-
-    int rc;
-    stringstream newpath;
-    newpath.str( "" );
-    newpath << zkRootNode_.c_str() 
-            << zkRootNodeInstance_.c_str() 
-            << ZCLIENT_CLUSTER_ZNODE << "/"
-            << nodeName;
-    string monZnode = newpath.str( );
-
-    lock();
-    rc = SetZNodeWatch( monZnode );
-    unlock();
-    if ( rc != ZOK )
-    {
-        char buf[MON_STRING_BUF_SIZE];
-        snprintf( buf, sizeof(buf)
-                , "[%s], SetZNodeWatch(%s) failed!\n"
-                , method_name
-                , monZnode.c_str() );
-        mon_log_write(MON_ZCLIENT_WATCHNODE_1, SQ_LOG_ERR, buf);
-    }
-    else
-    {
-        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
-        {
-            trace_printf( "%s@%d Watch set on monZnode=%s\n"
-                        , method_name, __LINE__
-                        , monZnode.c_str() );
-        }
-    }
-
-    TRACE_EXIT;
-    return(rc);
-}
-
-int CZClient::WatchNodeMasterDelete( const char *nodeName )
-{
-    const char method_name[] = "CZClient::WatchMasterDelete";
-    TRACE_ENTRY;
-    
     int rc = -1;
-    stringstream newpath;
-    newpath.str( "" );
-    newpath << zkRootNode_.c_str() 
-            << zkRootNodeInstance_.c_str() 
-            << ZCLIENT_MASTER_ZNODE <<"/"
-            << nodeName;
-           
-    string monZnode = newpath.str( );
-
-    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
-    {
-        trace_printf( "%s@%d zoo_delete(%s)\n"
-                    , method_name, __LINE__
-                    , monZnode.c_str() );
-    }
-   
     rc = zoo_delete( ZHandle
-                   , monZnode.c_str( )
+                   , znode.c_str()
                    , -1 );
-    if ( rc == ZOK )
+    if ( rc == ZOK || rc == ZNONODE)
     {
-        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+        if ( rc == ZNONODE)
         {
-            trace_printf( "%s@%d (MasterMonitor) WatchNodeMasterDelete deleted %s, with rc == ZOK\n"
-                        , method_name, __LINE__
-                        , nodeName );
+            // This is ok since we call it indiscriminately
+            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+            {
+                trace_printf( "%s@%d znode=%s already deleted!\n"
+                            , method_name, __LINE__
+                            , znode.c_str() );
+            }
         }
-        char buf[MON_STRING_BUF_SIZE];
-        snprintf( buf, sizeof(buf)
-                , "[%s], znode (%s) deleted!\n"
-                , method_name, nodeName );
-        mon_log_write(MON_ZCLIENT_WATCHMASTERNODEDELETE_1, SQ_LOG_INFO, buf);
-    }
-    else if ( rc == ZNONODE )
-    {
-        // This is fine since we call it indiscriminately
-        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+        else
         {
-            trace_printf( "%s@%d (MasterMonitor) WatchNodeMasterDelete already deleted %s, with rc == ZNONODE (fine)\n"
-                        , method_name, __LINE__
-                        , nodeName );
+            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+            {
+                trace_printf( "%s@%d znode=%s deleted!\n"
+                            , method_name, __LINE__
+                            , znode.c_str() );
+            }
         }
     }
     else if ( rc == ZCONNECTIONLOSS || 
               rc == ZOPERATIONTIMEOUT )
     {
-        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
-        {
-            trace_printf( "%s@%d (MasterMonitor) znode (%s) already deleted or cannot be accessed, rc=%d (%s)\n"
-                        , method_name, __LINE__
-                        , nodeName, rc, zerror(rc)  );
-        }
         rc = ZOK;
         char buf[MON_STRING_BUF_SIZE];
         snprintf( buf, sizeof(buf)
-                , "[%s], znode (%s) already deleted or cannot be accessed, rc=%d (%s)\n"
-                , method_name, nodeName, rc, zerror(rc)  );
-        mon_log_write(MON_ZCLIENT_WATCHMASTERNODEDELETE_2, SQ_LOG_INFO, buf);
+                , "[%s], znode (%s) cannot be accessed!\n"
+                , method_name, znode.c_str() );
+        mon_log_write(MON_ZCLIENT_ZNODEDELETE_1, SQ_LOG_INFO, buf);
     }
     else
     {
-        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
-        {
-            trace_printf( "%s@%d (MasterMonitor) WatchNodeMasterDelete deleted %s, with rc == ZOK\n"
-                        , method_name, __LINE__
-                        , nodeName );
-        }
         char buf[MON_STRING_BUF_SIZE];
         snprintf( buf, sizeof(buf)
                 , "[%s], zoo_delete(%s) failed with error %s\n"
-                , method_name, nodeName, zerror(rc) );
-        mon_log_write(MON_ZCLIENT_WATCHMASTERNODEDELETE_3, SQ_LOG_CRIT, buf);
+                , method_name, znode.c_str(), zerror(rc) );
+        mon_log_write(MON_ZCLIENT_ZNODEDELETE_1, SQ_LOG_CRIT, buf);
         switch ( rc )
         {
         case ZSYSTEMERROR:
@@ -1968,61 +3384,60 @@
             break;
         }
     }
-    
+
     TRACE_EXIT;
     return( rc );
 }
 
-int CZClient::WatchNodeDelete( const char *nodeName )
+int CZClient::ZNodeWatchReset( string &znode )
 {
-    const char method_name[] = "CZClient::WatchNodeDelete";
+    const char method_name[] = "CZClient::ZNodeWatchReset";
     TRACE_ENTRY;
 
-    int rc = -1;
-
-    stringstream newpath;
-    newpath.str( "" );
-    newpath << zkRootNode_.c_str() 
-            << zkRootNodeInstance_.c_str() 
-            << ZCLIENT_CLUSTER_ZNODE << "/"
-            << nodeName;
-    string monZnode = newpath.str( );
+    char  zkData[MAX_PROCESSOR_NAME];
+    int   rc = -1;
+    int   zkDataLen = sizeof(zkData);
+    Stat  stat;
 
     if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
     {
-        trace_printf( "%s@%d zoo_delete(%s)\n"
-                    , method_name, __LINE__
-                    , monZnode.c_str() );
+        trace_printf( "%s@%d znode=%s\n"
+                    , method_name, __LINE__, znode.c_str() );
     }
-    rc = zoo_delete( ZHandle
-                   , monZnode.c_str( )
-                   , -1 );
-    if ( rc == ZOK )
+    rc = ZooExistRetry( ZHandle, znode.c_str( ), 0, &stat );
+    if ( rc == ZNONODE ||
+         rc == ZCONNECTIONLOSS || 
+         rc == ZOPERATIONTIMEOUT )
     {
-        char buf[MON_STRING_BUF_SIZE];
-        snprintf( buf, sizeof(buf)
-                , "[%s], znode (%s) deleted!\n"
-                , method_name, nodeName );
-        mon_log_write(MON_ZCLIENT_WATCHNODEDELETE_1, SQ_LOG_INFO, buf);
+        // return the error
+        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+        {
+            trace_printf( "%s@%d znode=%s does not exist or "
+                          "cannot be accessed!\n"
+                        , method_name, __LINE__, znode.c_str() );
+        }
     }
-    else if ( rc == ZNONODE ||
-              rc == ZCONNECTIONLOSS || 
-              rc == ZOPERATIONTIMEOUT )
+    else if ( rc == ZOK )
     {
-        rc = ZOK;
-        char buf[MON_STRING_BUF_SIZE];
-        snprintf( buf, sizeof(buf)
-                , "[%s], znode (%s) already deleted or cannot be accessed!\n"
-                , method_name, nodeName );
-        mon_log_write(MON_ZCLIENT_WATCHNODEDELETE_2, SQ_LOG_INFO, buf);
+        // Reset a watch on monZode
+        int watch = 0;
+        rc = zoo_get( ZHandle, znode.c_str( ), watch, zkData, &zkDataLen, &stat );
+        if ( rc != ZOK )
+        {
+            char buf[MON_STRING_BUF_SIZE];
+            snprintf( buf, sizeof(buf)
+                    , "[%s], zoo_get() for %s failed with error %s\n"
+                    ,  method_name, znode.c_str( ), zerror(rc));
+            mon_log_write(MON_ZCLIENT_ZNODEWATCHRESET_1, SQ_LOG_ERR, buf);
+        }
     }
     else
     {
         char buf[MON_STRING_BUF_SIZE];
         snprintf( buf, sizeof(buf)
-                , "[%s], zoo_delete(%s) failed with error %s\n"
-                , method_name, nodeName, zerror(rc) );
-        mon_log_write(MON_ZCLIENT_WATCHNODEDELETE_3, SQ_LOG_CRIT, buf);
+                , "[%s], zoo_exists() for %s failed with error %s\n"
+                ,  method_name, znode.c_str( ), zerror(rc));
+        mon_log_write(MON_ZCLIENT_ZNODEWATCHRESET_2, SQ_LOG_CRIT, buf);
         switch ( rc )
         {
         case ZSYSTEMERROR:
@@ -2047,3 +3462,457 @@
     return( rc );
 }
 
+int CZClient::ZNodeWatchSet( string &znode )
+{
+    const char method_name[] = "CZClient::ZNodeWatchSet";
+    TRACE_ENTRY;
+    // Sets a watch on 'znode' when it exists; returns the rc of the last ZooKeeper call made.
+    char  zkData[MAX_PROCESSOR_NAME]; // receives the znode data; not examined here
+    int   rc = -1;
+    int   zkDataLen = sizeof(zkData);
+    Stat  stat;
+
+    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+    {
+        trace_printf( "%s@%d znode=%s\n"
+                    , method_name, __LINE__, znode.c_str() );
+    }
+    rc = ZooExistRetry( ZHandle, znode.c_str( ), 0, &stat ); // existence probe only, watch argument is 0
+    if ( rc == ZNONODE ||
+         rc == ZCONNECTIONLOSS || 
+         rc == ZOPERATIONTIMEOUT )
+    {
+        // These codes are only traced (no log event); the error is returned to the caller
+        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+        {
+            trace_printf( "%s@%d znode=%s does not exist or "
+                          "cannot be accessed!\n"
+                        , method_name, __LINE__, znode.c_str() );
+        }
+    }
+    else if ( rc == ZOK )
+    {
+        // Set a watch on znode: zoo_get() is called with watch=1, the data read is discarded
+        int watch = 1;
+        rc = zoo_get( ZHandle, znode.c_str( ), watch, zkData, &zkDataLen, &stat );
+        if ( rc != ZOK )
+        {
+            char buf[MON_STRING_BUF_SIZE];
+            snprintf( buf, sizeof(buf)
+                    , "[%s], zoo_get() for %s failed with error %s\n"
+                    ,  method_name, znode.c_str( ), zerror(rc));
+            mon_log_write(MON_ZCLIENT_ZNODEWATCHSET_1, SQ_LOG_ERR, buf);
+        }
+    }
+    else
+    {
+        char buf[MON_STRING_BUF_SIZE]; // any other existence-probe error is logged as critical
+        snprintf( buf, sizeof(buf)
+                , "[%s], zoo_exists() for %s failed with error %s\n"
+                ,  method_name, znode.c_str( ), zerror(rc));
+        mon_log_write(MON_ZCLIENT_ZNODEWATCHSET_2, SQ_LOG_CRIT, buf);
+        switch ( rc )
+        {
+        case ZSYSTEMERROR:
+        case ZRUNTIMEINCONSISTENCY:
+        case ZDATAINCONSISTENCY:
+        case ZMARSHALLINGERROR:
+        case ZUNIMPLEMENTED:
+        case ZBADARGUMENTS:
+        case ZINVALIDSTATE:
+        case ZSESSIONEXPIRED:
+        case ZCLOSING:
+            // Treat these errors like a session expiration, since
+            // we can't communicate with quorum servers
+            HandleMyNodeExpiration();
+            break;
+        default:
+            break; // remaining errors are logged above and simply returned
+        }
+    }
+
+    TRACE_EXIT;
+    return( rc );
+}
+
+int CZClient::ZNodeWatchChildSet( string &parentznode )
+{
+    const char method_name[] = "CZClient::ZNodeWatchChildSet";
+    TRACE_ENTRY;
+    // Sets a child watch on 'parentznode'; returns the rc of the last ZooKeeper call made.
+    bool found = false;
+    int rc = -1;
+    int retries = 0;
+    int watch = 1;
+    Stat stat;
+    struct String_vector nodes;
+
+    nodes.count = 0;
+    nodes.data = NULL;
+
+    while ( !found )
+    {
+        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+        {
+            trace_printf( "%s@%d parentznode=%s\n"
+                        , method_name, __LINE__, parentznode.c_str() );
+        }
+        // Verify the existence of the parent
+        rc = ZooExistRetry( ZHandle, parentznode.c_str( ), 0, &stat );
+        if ( rc == ZNONODE )
+        {
+            if (retries > 10)               // retry budget used up: return ZNONODE
+                break;
+            retries++;
+            sleep(ZOOKEEPER_RETRY_WAIT);    // back off before re-probing; the loop then repeats
+        }
+        else if ( rc == ZOK )
+        {
+            // Fetch the children of the parent with watch=1.
+            //
+            // The call is made to register the child watch; the returned
+            // list is only counted for tracing and is freed right away.
+            rc = zoo_get_children( ZHandle, parentznode.c_str( ), watch, &nodes );
+            if ( rc == ZOK )
+            {
+                if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+                {
+                    trace_printf( "%s@%d nodes.count=%d\n"
+                                , method_name, __LINE__
+                                , nodes.count );
+                }
+                FreeStringVector( &nodes );
+                found = true;
+            }
+            else
+            {
+                char buf[MON_STRING_BUF_SIZE];
+                snprintf( buf, sizeof(buf)
+                        , "[%s], zoo_get_children(%s) failed with error %s\n"
+                        ,  method_name, parentznode.c_str( ), zerror(rc));
+                mon_log_write(MON_ZCLIENT_ZNODEWATCHCHILDSET_1, SQ_LOG_ERR, buf);
+                break;
+            }
+        }
+        else  // error
+        {
+            char buf[MON_STRING_BUF_SIZE];
+            snprintf( buf, sizeof(buf)
+                    , "[%s], zoo_exists() for %s failed with error %s\n"
+                    ,  method_name, parentznode.c_str( ), zerror(rc));
+            mon_log_write(MON_ZCLIENT_ZNODEWATCHCHILDSET_2, SQ_LOG_ERR, buf);
+            break;
+        }
+    }
+
+    TRACE_EXIT;
+    return( rc );
+}
+
+int CZClient::ZNodesTreeCreate( void )
+{
+    const char method_name[] = "CZClient::ZNodesTreeCreate";
+    TRACE_ENTRY;
+    // Probes each level of the znode tree top-down and creates the levels that do not exist.
+    int rc;
+    Stat stat;
+    // The cluster, configured, error, running, monitor and master paths are saved in members.
+    stringstream ss;
+    ss.str( "" );
+    ss << zkRootNode_.c_str();
+    string rootDir( ss.str( ) );
+
+    rc = ZooExistRetry( ZHandle, rootDir.c_str(), 0, &stat );
+    switch (rc)
+    {
+    case ZOK:
+        break;
+    case ZNONODE:
+        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+        {
+            trace_printf( "%s@%d ZNodeCreate(%s)\n"
+                        , method_name, __LINE__
+                        , rootDir.c_str() );
+        }
+        rc = ZNodeCreate( rootDir.c_str(), NULL, 0 );
+        if ( rc && rc != ZNODEEXISTS )
+        {
+            return(rc); // Return the error
+        }
+        rc = ZOK; // created here, or already created by someone else (ZNODEEXISTS)
+        break;
+    default:
+        char buf[MON_STRING_BUF_SIZE];
+        snprintf( buf, sizeof(buf)
+                , "[%s], zoo_exists(%s) failed with error %s\n"
+                , method_name, rootDir.c_str(), zerror(rc) );
+        mon_log_write(MON_ZCLIENT_ZNODESTREECREATE_1, SQ_LOG_ERR, buf);
+        return(rc); // Return the error (rc is neither ZOK nor ZNONODE in this branch)
+        // no break needed after the unconditional return
+    }
+
+    ss.str( "" );
+    ss << zkRootNode_.c_str()
+       << zkRootNodeInstance_.c_str();
+    string instanceDir( ss.str( ) );
+
+    rc = ZooExistRetry( ZHandle, instanceDir.c_str( ), 0, &stat );
+    switch (rc)
+    {
+    case ZOK:
+        break;
+    case ZNONODE:
+        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+        {
+            trace_printf( "%s@%d ZNodeCreate(%s)\n"
+                        , method_name, __LINE__
+                        , instanceDir.c_str() );
+        }
+        rc = ZNodeCreate( instanceDir.c_str(), NULL, 0 );
+        if ( rc && rc != ZNODEEXISTS )
+        {
+            return(rc); // Return the error
+        }
+        rc = ZOK;
+        break;
+    default:
+        char buf[MON_STRING_BUF_SIZE];
+        snprintf( buf, sizeof(buf)
+                , "[%s], zoo_exists(%s) failed with error %s\n"
+                , method_name, instanceDir.c_str( ), zerror(rc) );
+        mon_log_write(MON_ZCLIENT_ZNODESTREECREATE_2, SQ_LOG_ERR, buf);
+        break; // NOTE(review): unlike the root level this error is only logged; the next probe overwrites rc (same for the levels below)
+    }
+
+    ss.str( "" );
+    ss << zkRootNode_.c_str()
+       << zkRootNodeInstance_.c_str()
+       << ZCLIENT_CLUSTER_ZNODE;
+    clusterZNodePath_ = ss.str();
+
+    rc = ZooExistRetry( ZHandle, clusterZNodePath_.c_str( ), 0, &stat );
+    switch (rc)
+    {
+    case ZOK:
+        break;
+    case ZNONODE:
+        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+        {
+            trace_printf( "%s@%d ZNodeCreate(%s)\n"
+                        , method_name, __LINE__
+                        , clusterZNodePath_.c_str() );
+        }
+        rc = ZNodeCreate( clusterZNodePath_.c_str(), NULL, 0 );
+        if ( rc && rc != ZNODEEXISTS )
+        {
+            return(rc); // Return the error
+        }
+        rc = ZOK;
+        break;
+    default:
+        char buf[MON_STRING_BUF_SIZE];
+        snprintf( buf, sizeof(buf)
+                , "[%s], zoo_exists(%s) failed with error %s\n"
+                , method_name, clusterZNodePath_.c_str(), zerror(rc) );
+        mon_log_write(MON_ZCLIENT_ZNODESTREECREATE_3, SQ_LOG_ERR, buf);
+        break;
+    }
+
+    ss.str( "" );
+    ss << zkRootNode_.c_str()
+       << zkRootNodeInstance_.c_str()
+       << ZCLIENT_CLUSTER_ZNODE
+       << ZCLIENT_CONFIGURED_ZNODE;
+    configuredZNodePath_ = ss.str();
+
+    rc = ZooExistRetry( ZHandle, configuredZNodePath_.c_str( ), 0, &stat );
+    switch (rc)
+    {
+    case ZOK:
+        break;
+    case ZNONODE:
+        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+        {
+            trace_printf( "%s@%d ZNodeCreate(%s)\n"
+                        , method_name, __LINE__
+                        , configuredZNodePath_.c_str() );
+        }
+        rc = ZNodeCreate( configuredZNodePath_.c_str(), NULL, 0 );
+        if ( rc && rc != ZNODEEXISTS )
+        {
+            return(rc); // Return the error
+        }
+        rc = ZOK;
+        break;
+    default:
+        char buf[MON_STRING_BUF_SIZE];
+        snprintf( buf, sizeof(buf)
+                , "[%s], zoo_exists(%s) failed with error %s\n"
+                , method_name, configuredZNodePath_.c_str(), zerror(rc) );
+        mon_log_write(MON_ZCLIENT_ZNODESTREECREATE_4, SQ_LOG_ERR, buf);
+        break;
+    }
+
+    ss.str( "" );
+    ss << zkRootNode_.c_str()
+       << zkRootNodeInstance_.c_str()
+       << ZCLIENT_CLUSTER_ZNODE
+       << ZCLIENT_ERROR_ZNODE;
+    errorZNodePath_ = ss.str();
+
+    rc = ZooExistRetry( ZHandle, errorZNodePath_.c_str( ), 0, &stat );
+    switch (rc)
+    {
+    case ZOK:
+        break;
+    case ZNONODE:
+        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+        {
+            trace_printf( "%s@%d ZNodeCreate(%s)\n"
+                        , method_name, __LINE__
+                        , errorZNodePath_.c_str() );
+        }
+        rc = ZNodeCreate( errorZNodePath_.c_str(), NULL, 0 );
+        if ( rc && rc != ZNODEEXISTS )
+        {
+            return(rc); // Return the error
+        }
+        rc = ZOK;
+        break;
+    default:
+        char buf[MON_STRING_BUF_SIZE];
+        snprintf( buf, sizeof(buf)
+                , "[%s], zoo_exists(%s) failed with error %s\n"
+                , method_name, errorZNodePath_.c_str(), zerror(rc) );
+        mon_log_write(MON_ZCLIENT_ZNODESTREECREATE_6, SQ_LOG_ERR, buf);
+        break;
+    }
+
+    ss.str( "" );
+    ss << zkRootNode_.c_str()
+       << zkRootNodeInstance_.c_str()
+       << ZCLIENT_CLUSTER_ZNODE
+       << ZCLIENT_RUNNING_ZNODE;
+    runningZNodePath_ = ss.str();
+
+    rc = ZooExistRetry( ZHandle, runningZNodePath_.c_str( ), 0, &stat );
+    switch (rc)
+    {
+    case ZOK:
+        break;
+    case ZNONODE:
+        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+        {
+            trace_printf( "%s@%d ZNodeCreate(%s)\n"
+                        , method_name, __LINE__
+                        , runningZNodePath_.c_str() );
+        }
+        rc = ZNodeCreate( runningZNodePath_.c_str(), NULL, 0 );
+        if ( rc && rc != ZNODEEXISTS )
+        {
+            return(rc); // Return the error
+        }
+        rc = ZOK;
+        break;
+    default:
+        char buf[MON_STRING_BUF_SIZE];
+        snprintf( buf, sizeof(buf)
+                , "[%s], zoo_exists(%s) failed with error %s\n"
+                , method_name, runningZNodePath_.c_str(), zerror(rc) );
+        mon_log_write(MON_ZCLIENT_ZNODESTREECREATE_5, SQ_LOG_ERR, buf);
+        break;
+    }
+
+    ss.str( "" );
+    ss << zkRootNode_.c_str()
+       << zkRootNodeInstance_.c_str()
+       << ZCLIENT_MONITOR_ZNODE;
+    monitorZNodePath_ = ss.str();
+
+    rc = ZooExistRetry( ZHandle, monitorZNodePath_.c_str( ), 0, &stat );
+    switch (rc)
+    {
+    case ZOK:
+        break;
+    case ZNONODE:
+        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+        {
+            trace_printf( "%s@%d ZNodeCreate(%s)\n"
+                        , method_name, __LINE__
+                        , monitorZNodePath_.c_str() );
+        }
+        rc = ZNodeCreate( monitorZNodePath_.c_str(), NULL, 0 );
+        if ( rc && rc != ZNODEEXISTS )
+        {
+            return(rc); // Return the error
+        }
+        rc = ZOK;
+        break;
+    default:
+        char buf[MON_STRING_BUF_SIZE];
+        snprintf( buf, sizeof(buf)
+                , "[%s], zoo_exists(%s) failed with error %s\n"
+                , method_name, monitorZNodePath_.c_str(), zerror(rc) );
+        mon_log_write(MON_ZCLIENT_ZNODESTREECREATE_7, SQ_LOG_ERR, buf);
+        break;
+    }
+
+    ss.str( "" );
+    ss << zkRootNode_.c_str()
+       << zkRootNodeInstance_.c_str()
+       << ZCLIENT_MONITOR_ZNODE
+       << ZCLIENT_MASTER_ZNODE;
+    // The member below holds the master path; no separate local copy is kept.
+    masterZNodePath_ = ss.str();
+
+    rc = ZooExistRetry( ZHandle, masterZNodePath_.c_str( ), 0, &stat );
+    switch (rc)
+    {
+    case ZOK:
+        break;
+    case ZNONODE:
+        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+        {
+            trace_printf( "%s@%d Invoking ZNodeCreate(%s)\n"
+                        , method_name, __LINE__
+                        , masterZNodePath_.c_str() );
+        }
+        rc = ZNodeCreate( masterZNodePath_.c_str(), NULL, 0 );
+        if ( rc && rc != ZNODEEXISTS )
+        {
+            return(rc); // Return the error
+        }
+        rc = ZOK;
+        break;
+    default:
+        char buf[MON_STRING_BUF_SIZE];
+        snprintf( buf, sizeof(buf)
+                , "[%s], zoo_exists(%s) failed with error %s\n"
+                , method_name, masterZNodePath_.c_str(), zerror(rc) );
+        mon_log_write(MON_ZCLIENT_ZNODESTREECREATE_8, SQ_LOG_ERR, buf);
+        break;
+    }
+
+    TRACE_EXIT;
+    return(rc); // outcome of the master level; failed creates and a failed root probe returned earlier
+}
+
+int CZClient::ZooExistRetry(zhandle_t *zh, const char *path, int watch, struct Stat *stat)
+{
+    // Probe once; while the result is a transient failure (connection loss,
+    // operation timeout or session moved, e.g. one zookeeper server down),
+    // wait and probe again, at most ZOOKEEPER_RETRY_COUNT more times.
+    int status = zoo_exists(zh, path, watch, stat);
+
+    for ( int attempt = 0; attempt < ZOOKEEPER_RETRY_COUNT; attempt++ )
+    {
+        bool transient = ( status == ZCONNECTIONLOSS
+                        || status == ZOPERATIONTIMEOUT
+                        || status == ZSESSIONMOVED );
+        if ( !transient ) break;   // success or a non-retryable error: stop here
+        sleep(ZOOKEEPER_RETRY_WAIT);
+        status = zoo_exists(zh, path, watch, stat);
+    }
+    return status;   // result of the last zoo_exists() call
+}
+
diff --git a/core/sqf/monitor/linux/zclient.h b/core/sqf/monitor/linux/zclient.h
index 22cf730..46d0c45 100644
--- a/core/sqf/monitor/linux/zclient.h
+++ b/core/sqf/monitor/linux/zclient.h
@@ -1,4 +1,4 @@
-/**********************************************************************
+/*********************************************************************
 // @@@ START COPYRIGHT @@@
 //
 // Licensed to the Apache Software Foundation (ASF) under one
@@ -30,33 +30,32 @@
 //  CZClient::StartWork() and CZClient::ShutdownWork() manage ZCLientThread
 //  creation and deletion.
 //
-//      CZClient::StartMonitoring()
+//      CZClient::StartWork()
 //              |
 //          pthread_create(ZClientThread)
 //              |
 //          ZC_DISABLED
 //              |
-//          CZClient::MonitorZCluster()
+//          CZClient::MonitorCluster()
 //      
-//  CZClient::MonitorZCluster() is the thread main, a state machine:
+//  CZClient::MonitorCluster() is the thread main, a state machine:
 //      
 //                       CZClient::StartMonitoring()
 //                               |
 //                           ZC_START
 //                               |
-//                       CZClient::StartClusterMonitoring()
+//                       CZClient::ClusterMonitoringStart()
 //                               |
 //                           ZC_WATCH
 //                               |
-//                       CZClient::WatchCluster()
+//                       CZClient::RunningZNodesWatchSet()
 //                               |
 //                           ZC_MYZNODE <------------------|
 //                               |                         |
-//                       CZClient::CheckMyZNode()          |
+//                       CZClient::MyRunningZNodeCheck()   |
 //                               |                         |
 //                               |-------------------------|
 //                                                         |
-//  ZOO_CHILD_EVENT                                        |
 //  ZOO_NOTWATCHING_EVENT                                  |
 //            |                                            |
 //    CZClient::TriggerCheck()---|                         |
@@ -66,15 +65,44 @@
 //                       CZClient::CheckCluster()          |
 //                               |                         |
 //                               |-------------------------|
-//  ZOO_CREATED_EVENT                                      |
-//  ZOO_DELETED_EVENT                                      |
+//                                                         |
 //  ZOO_CHANGED_EVENT                                      |
 //            |                                            |
 //    CZClient::TriggerCheck()---|                         |
 //                               |                         |
-//                           ZC_ZNODE                      |
+//                           ZC_ZNODE_CHANGED              |
 //                               |                         |
-//                       CZClient::HandleExpiredZNode()    |
+//                       CZClient::HandleChangedZNode()    |
+//                               |                         |
+//                               |-------------------------|
+//                                                         |
+//  ZOO_CHILD_EVENT                                        |
+//            |                                            |
+//    CZClient::TriggerCheck()---|                         |
+//                               |                         |
+//                           ZC_ZNODE_CHILD                |
+//                               |                         |
+//                       CZClient::HandleChildZNode()      |
+//                               |                         |
+//                               |-------------------------|
+//                                                         |
+//  ZOO_CREATED_EVENT                                      |
+//            |                                            |
+//    CZClient::TriggerCheck()---|                         |
+//                               |                         |
+//                           ZC_ZNODE_CREATED              |
+//                               |                         |
+//                       CZClient::HandleCreatedZNode()    |
+//                               |                         |
+//                               |-------------------------|
+//                                                         |
+//  ZOO_DELETED_EVENT                                      |
+//            |                                            |
+//    CZClient::TriggerCheck()---|                         |
+//                               |                         |
+//                           ZC_ZNODE_DELETED              |
+//                               |                         |
+//                       CZClient::HandleDeletedZNode()    |
 //                               |                         |
 //                               |-------------------------|
 //                 
@@ -82,7 +110,7 @@
 //                               |
 //                           ZC_STOP
 //                               |
-//                       CZClient::StopClusterMonitoring()
+//                       CZClient::ClusterMonitoringStop()
 //                               |
 //                           ZC_DISABLED
 //
@@ -102,24 +130,50 @@
 
 using namespace std;
 
-#define ZCLIENT_TRAFODION_ZNODE     "/trafodion"
-#define ZCLIENT_INSTANCE_ZNODE      "/instance"
-#ifdef NAMESERVER_PROCESS
-#define ZCLIENT_MASTER_ZNODE        "/nsmaster"
+// The following is the znode directory hierarchy:
+//      ZCLIENT_TRAFODION_ZNODE "/$TRAF_ROOT_ZNODE"
+//      ZCLIENT_INSTANCE_ZNODE      "/$TRAF_INSTANCE_ID"
+#define ZCLIENT_CLUSTER_ZNODE           "/cluster"
+#define ZCLIENT_CONFIGURED_ZNODE            "/configured"
+#define ZCLIENT_ERROR_ZNODE                 "/error"
+#define ZCLIENT_RUNNING_ZNODE               "/running"
+#define ZCLIENT_MONITOR_ZNODE           "/monitor"
+#ifndef NAMESERVER_PROCESS
+#define ZCLIENT_MASTER_ZNODE                "/master"
 #else
-#define ZCLIENT_MASTER_ZNODE        "/master"
+#define ZCLIENT_MASTER_ZNODE                "/nsmaster"
 #endif
+// Usage:
+//   ZCLIENT_ERROR_ZNODE - to determine when a non-communicative node should be declared down
+//    /error
+//       /<error-znode>      - node with communication problem if child count > 0
+//          /<znode>         - node that has a problem communicating with <error-znode>
+//             o when more than one node has a problem with <error-znode>,
+//               the <error-znode>'s /running znode is deleted, triggering node down,
+//               and the node down processing will delete the <error-znode> and child <znode>s
+//   ZCLIENT_CONFIGURED_ZNODE - to determine when a node is added/deleted from configuration
+//    /configured
+//       /<znode>      - node in static configuration
+//   ZCLIENT_RUNNING_ZNODE - to determine operational status (node up/down)
+//    /running
+//       /<znode>      - node operational, i.e., monitor is running
+
+#define ZCLIENT_MASTER_ZNODE_RETRY_COUNT 60
 
 typedef list<string>    ZNodeList_t;
 
-// The following two functions must be implemented in the calling program.
+// The following functions must be implemented in the calling program.
 // - HandleMyNodeExpiration() is invoked when the monitor's session expires, or
-//   the monitor's znode expires or quorum communication fails
+//                            the monitor's znode expires or
+//                            quorum communication fails
 // - HandleNodeExpiration(nodeName) is invoked when the znode associated with
-//   the nodeName passed in expires.
+//                                  the nodeName passed in expires.
 extern void HandleMyNodeExpiration( void );
+extern void HandleNodeChange( const char *nodeName );
+extern void HandleNodeConfigurationChange( void );
+extern void HandleNodeCreated( const char *nodeName );
+extern void HandleNodeError( const char *nodeName );
 extern void HandleNodeExpiration( const char *nodeName );
-extern void HandleAssignMonitorLeader ( const char* failedMaster );
 
 class CZClient : public CLock
 {
@@ -131,8 +185,11 @@
         ZC_DISABLED=0,    // initial state
         ZC_START,         // start monitoring
         ZC_WATCH,         // set cluster watchers
-        ZC_CLUSTER,       // check cluster
-        ZC_ZNODE,         // check znode
+        ZC_CLUSTER,       // check all cluster znodes
+        ZC_ZNODE_CHANGED, // check znode change
+        ZC_ZNODE_CHILD,   // check znode child change
+        ZC_ZNODE_CREATED, // check znode created
+        ZC_ZNODE_DELETED, // check znode delete
         ZC_MYZNODE,       // check this monitor's znode
         ZC_STOP,          // stop monitoring
         ZC_SHUTDOWN       // thread exit 
@@ -142,55 +199,88 @@
             , const char *rootZNode
             , const char *instanceZNode );
     ~CZClient( void );
-
-    int     CreateMasterZNode(  const char *nodeName );
-    int     GetSessionTimeout( void) { return( zkSessionTimeout_ ); }
-    bool    IsZNodeExpired( const char *nodeName, int &zerr );
-    void    MonitorZCluster( void );
-    void    SetCheckCluster( bool checkCluster ) { CAutoLock lock(getLocker()); checkCluster_ = checkCluster; }
-    void    SetState( ZClientState_t state ) { CAutoLock lock(getLocker()); state_ = state; }
-    void    SetState( ZClientState_t state, const char *znodePath );
+    
+    void    ClusterWatchEnabledSet( bool enabled ) { CAutoLock lock(getLocker()); clusterWatchEnabled_ = enabled; }
+    int     ConfiguredZNodeCreate( const char *nodeName );
+    int     ConfiguredZNodeDelete( const char *nodeName );
+    int     ConfiguredZNodeWatchAdd( void );
+    int     ConfiguredZNodeWatchDelete( void );
+    void    ConfiguredZNodesDelete( void );
+    int     ConfiguredZNodesGet( String_vector *children );
+    int     ErrorZNodeCreate( const char *errorNode );
+    int     ErrorZNodeWatchAdd( void );
+    int     ErrorZNodeWatchDelete( void );
+    void    ErrorZNodesDelete( void );
+    int     ErrorZNodesGet( String_vector *children, bool doRetries=true );
+    int     ErrorZNodesGetChild( const char *errorNode, String_vector *children );
+    void    HandleErrorChildZNodesForZNodeChild( const char *childNode, bool doRetries=false );
+    bool    IsRunningZNodeExpired( const char *nodeName, int &zerr );
+    const char* MasterWaitForAndReturn( bool doWait );
+    int     MasterZNodeCreate( const char *nodeName );
+    int     MasterZNodeDelete( const char *nodeName );
+    void    MonitorCluster( void );
+    int     RunningZNodeDelete( const char *nodeName );
+    int     RunningZNodeWatchAdd( const char *nodeName );
+    void    RunningZNodesDelete( void );
+    int     SessionTimeoutGet( void) { return( zkSessionTimeout_ ); }
+    ZClientState_t StateGet( void ) { CAutoLock lock(getLocker()); return( shutdown_?ZC_SHUTDOWN:state_ ); }
+    void    StateSet( ZClientState_t state );
+    void    StateSet( int type, ZClientState_t state, const char *znodePath );
     int     ShutdownWork( void );
     void    StartMonitoring( void );
     int     StartWork( void );
     void    StopMonitoring( void );
     void    TriggerCheck( int type, const char *znodePath );
-    const char* WaitForAndReturnMaster( bool doWait );
-    int     WatchNode( const char *nodeName );
-    int     WatchMasterNode( const char *nodeName );
-    int     WatchNodeDelete( const char *nodeName );
-    int     WatchNodeMasterDelete( const char *nodeName );
-
+    
 private:
-    int     ZooExistRetry(zhandle_t *zh, const char *path, int watch, struct Stat *stat);
-    void    CheckCluster( void );
-    void    CheckMyZNode( void );
-    int     GetClusterZNodes( String_vector *children );
-    int     GetZNodeData( string &monZnode, string &nodeName, int &pnid );
-    ZClientState_t GetState( void ) { CAutoLock lock(getLocker()); return( state_ ); }
-    void    HandleExpiredZNode( void );
-    void    HandleMasterZNode ( void );
+    void    ClusterMonitoringStart( void );
+    void    ClusterMonitoringStop( void );
+    void    ConfiguredZNodesWatchSet( void );
+    void    EnabledSet( bool enabled ) { CAutoLock lock(getLocker()); enabled_ = enabled; }
+    int     ErrorZNodeDelete( const char *errorNode, String_vector *errorChildNodes );
+    int     ErrorChildZNodeDelete( const char *errorNode
+                                 , const char *errorChildNode
+                                 , String_vector *errorChildNodes );
+    void    ErrorZNodesWatchSet( void );
+    void    HandleChangedZNode( void );
+    void    HandleChildZNode( void );
+    void    HandleConfiguredZNodes( void );
+    void    HandleCreatedZNode( void );
+    void    HandleDeletedZNode( void );
+    void    HandleErrorZNode( const char *errorNode, const char *childNode );
+    void    HandleErrorZNodes( void );
+    void    HandleErrorChildZNodes( const char *errorNode );
     int     InitializeZClient( void );
+    bool    IsClusterWatchEnabled( void ) { CAutoLock lock(getLocker()); return( clusterWatchEnabled_ ); }
     bool    IsEnabled( void ) { CAutoLock lock(getLocker()); return( enabled_ ); }
-    bool    IsCheckCluster( void ) { CAutoLock lock(getLocker()); return( checkCluster_ ); }
-    int     MakeClusterZNodes( void );
-    int     RegisterMyNodeZNode( void );
-    int     RegisterZNode( const char *znodePath
-                         , const char *znodeData
-                         , int flags );
-    void    SetEnabled( bool enabled ) { CAutoLock lock(getLocker()); enabled_ = enabled; }
-    void    SetTimeToWakeUp( struct timespec &ts);
-    int     SetZNodeWatch( string &monZnode );
-    void    StartClusterMonitoring( void );
-    void    StopClusterMonitoring( void );
-    void    WatchCluster( void );
+    bool    IsZNodeMaster( const char *nodeName );
+    void    MyRunningZNodeCheck( void );
+    int     MyRunningZNodeCreate( void );
+    int     RunningZNodeWatchDelete( const char *nodeName );
+    void    RunningZNodesCheck( void );
+    int     RunningZNodesGet( String_vector *children );
+    void    RunningZNodesWatchSet( void );
+    char*   StrCpyLeafZNode( char* znode, const char* znodePath );
+    void    TimeToWakeUpSet( struct timespec &ts);
+    int     ZNodeCreate( const char *znodePath
+                       , const char *znodeData
+                       , int flags 
+                       , bool existOk=false );
+    int     ZNodeDataGet( string &monZnode, string &nodeName, int &pnid );
+    int     ZNodeDelete( string &znode );
+    int     ZNodeWatchReset( string &monZnode );
+    int     ZNodeWatchSet( string &monZnode );
+    int     ZNodeWatchChildSet( string &parentznode );
+    int     ZNodesTreeCreate( void );
+    int     ZooExistRetry(zhandle_t *zh, const char *path, int watch, struct Stat *stat);
 
     pthread_t       threadId_;
 
     ZClientState_t  state_;        // Physical node's current operating state
     bool            enabled_;      // true when cluster monitoring enabled
-    bool            checkCluster_; // true when cluster monitoring enabled
+    bool            clusterWatchEnabled_; // true when cluster monitoring enabled
     bool            resetMyZNodeFailedTime_; // set to trigger fail time reset
+    bool            shutdown_;     // set to terminate all process and exit thread
     long            zcMonitoringRate_; // in seconds
 
     string          zkQuorumHosts_;
@@ -200,7 +290,17 @@
     int             zkSessionTimeout_;
     struct timespec myZNodeFailedTime_;
     
-    ZNodeList_t     znodeQueue_;
+    string          clusterZNodePath_;
+    string          configuredZNodePath_;
+    string          errorZNodePath_;
+    string          masterZNodePath_;
+    string          monitorZNodePath_;
+    string          runningZNodePath_;
+    
+    ZNodeList_t     znodeChangedQueue_;
+    ZNodeList_t     znodeChildQueue_;
+    ZNodeList_t     znodeCreatedQueue_;
+    ZNodeList_t     znodeDeletedQueue_;
 };
 
 #endif // ZCLIENT_H_
diff --git a/core/sqf/monitor/linux/zootest.cxx b/core/sqf/monitor/linux/zootest.cxx
index 4b7a11f..f7568b4 100644
--- a/core/sqf/monitor/linux/zootest.cxx
+++ b/core/sqf/monitor/linux/zootest.cxx
@@ -36,6 +36,7 @@
 #include "msgdef.h"
 #include "montrace.h"
 #include "monlogging.h"
+#include "pnode.h"
 #include "zookeeper/zookeeper.h"
 #include "zclient.h"
 #include "zootest.h"
@@ -44,6 +45,8 @@
 
 bool debugFlag = true;
 
+bool IsAgentMode = false;
+bool IsMaster = false;
 bool IsRealCluster = true;
 bool ZClientEnabled = true;
 char Node_name[MAX_PROCESSOR_NAME] = {'\0'};
@@ -51,18 +54,12 @@
 int MyPNID = -1;
 int MyNid = -1;
 int MyPid = -1;
+int InstanceId = -1;
+extern CNodeContainer *Nodes;
 
 CZClient    *ZClient = NULL;
 CMonLog     *MonLog =  NULL;
 
-void HandleAssignMonitorLeader ( const char* failedMaster )
-{
-    const char method_name[] = "HandleAssignMonitorLeader";
-    TRACE_ENTRY;
-    failedMaster = failedMaster;
-    TRACE_EXIT;
-}
-
 void HandleMyNodeExpiration( void )
 {
     const char method_name[] = "HandleMyNodeExpiration";
@@ -75,6 +72,42 @@
     exit( 1  );
 }
 
+void HandleNodeChange( const char *nodeName ) // Prints a "znode changed" notice for nodeName to stdout.
+{
+    const char method_name[] = "HandleNodeChange"; // printed below; presumably also read by the TRACE_* macros -- confirm
+    TRACE_ENTRY;
+    printf( "%s@%d Node %s znode changed!\n"
+          , method_name, __LINE__, nodeName );
+    TRACE_EXIT;
+}
+
+void HandleNodeConfigurationChange( void ) // Prints a "Node configuration changed" notice to stdout.
+{
+    const char method_name[] = "HandleNodeConfigurationChange"; // printed below; presumably also read by the TRACE_* macros -- confirm
+    TRACE_ENTRY;
+    printf( "%s@%d Node configuration changed!\n"
+          , method_name, __LINE__ );
+    TRACE_EXIT;
+}
+
+void HandleNodeCreated( const char *nodeName ) // Prints a "znode created" notice for nodeName to stdout.
+{
+    const char method_name[] = "HandleNodeCreated"; // printed below; presumably also read by the TRACE_* macros -- confirm
+    TRACE_ENTRY;
+    printf( "%s@%d Node %s znode created!\n"
+          , method_name, __LINE__, nodeName );
+    TRACE_EXIT;
+}
+
+void HandleNodeError( const char *nodeName ) // Prints an "ERROR child" notice for nodeName to stdout.
+{
+    const char method_name[] = "HandleNodeError"; // printed below; presumably also read by the TRACE_* macros -- confirm
+    TRACE_ENTRY;
+    printf( "%s@%d Node %s ERROR child!\n"
+          , method_name, __LINE__, nodeName );
+    TRACE_EXIT;
+}
+
 void HandleNodeExpiration( const char *nodeName )
 {
     const char method_name[] = "HandleNodeExpiration";
@@ -92,6 +125,9 @@
     if ( ZClientEnabled )
     {
         string       hostName;
+        string       instanceId;
+        string       trafodionRootZNode;
+        stringstream ss;
         string       zkQuorumHosts;
         stringstream zkQuorumPort;
         char *env;
@@ -163,9 +199,50 @@
             }
         }
     
+        env = getenv("TRAF_ROOT_ZNODE");
+        if ( env )
+        {
+            ss.str( "" );
+            ss << env;
+            trafodionRootZNode = ss.str();
+        }
+        else
+        {
+            ss.str( "" );
+            ss << "/trafodion";
+            trafodionRootZNode = ss.str();
+            char la_buf[MON_STRING_BUF_SIZE];
+            sprintf( la_buf
+                   , "[%s], Environment variable TRAF_ROOT_ZNODE is undefined, defaulting trafodionRootZNode=%s\n"
+                   , method_name, trafodionRootZNode.c_str() );
+            printf( "%s", la_buf); // emit the warning: it was formatted but never printed, unlike the TRAF_INSTANCE_ID fallback
+        }
+
+        env = getenv("TRAF_INSTANCE_ID");
+        if ( env && isdigit(*env) )
+        {
+            InstanceId = atoi(env);
+            ss.str( "" );
+            ss << "/" << InstanceId;
+            instanceId = ss.str();
+        }
+        else
+        {
+            InstanceId = 1;
+            ss.str( "" );
+            ss << "/" << InstanceId;
+            instanceId = ss.str();
+
+            char la_buf[MON_STRING_BUF_SIZE];
+            sprintf( la_buf
+                   , "[%s], Environment variable TRAF_INSTANCE_ID is undefined, defaulting instanceId=%s\n"
+                   , method_name, instanceId.c_str() );
+            printf( "%s", la_buf);
+        }
+
         ZClient = new CZClient( zkQuorumPort.str().c_str()
-                              , ZCLIENT_TRAFODION_ZNODE
-                              , ZCLIENT_INSTANCE_ZNODE );
+                              , trafodionRootZNode.c_str()
+                              , instanceId.c_str() );
         if ( ZClient == NULL )
         {
             char buf[MON_STRING_BUF_SIZE];
diff --git a/core/sqf/monitor/linux/attach.cxx b/core/sqf/monitor/test-legacy/attach.cxx
similarity index 100%
rename from core/sqf/monitor/linux/attach.cxx
rename to core/sqf/monitor/test-legacy/attach.cxx
diff --git a/core/sqf/monitor/linux/client.cxx b/core/sqf/monitor/test-legacy/client.cxx
similarity index 100%
rename from core/sqf/monitor/linux/client.cxx
rename to core/sqf/monitor/test-legacy/client.cxx
diff --git a/core/sqf/monitor/linux/client2.cxx b/core/sqf/monitor/test-legacy/client2.cxx
similarity index 100%
rename from core/sqf/monitor/linux/client2.cxx
rename to core/sqf/monitor/test-legacy/client2.cxx
diff --git a/core/sqf/monitor/linux/getseq.cxx b/core/sqf/monitor/test-legacy/getseq.cxx
similarity index 100%
rename from core/sqf/monitor/linux/getseq.cxx
rename to core/sqf/monitor/test-legacy/getseq.cxx
diff --git a/core/sqf/monitor/linux/montest001 b/core/sqf/monitor/test-legacy/montest001
similarity index 100%
rename from core/sqf/monitor/linux/montest001
rename to core/sqf/monitor/test-legacy/montest001
diff --git a/core/sqf/monitor/linux/montest_readme.txt b/core/sqf/monitor/test-legacy/montest_readme.txt
similarity index 100%
rename from core/sqf/monitor/linux/montest_readme.txt
rename to core/sqf/monitor/test-legacy/montest_readme.txt
diff --git a/core/sqf/monitor/linux/montest_run.virtual b/core/sqf/monitor/test-legacy/montest_run.virtual
similarity index 100%
rename from core/sqf/monitor/linux/montest_run.virtual
rename to core/sqf/monitor/test-legacy/montest_run.virtual
diff --git a/core/sqf/monitor/linux/montim.cxx b/core/sqf/monitor/test-legacy/montim.cxx
similarity index 100%
rename from core/sqf/monitor/linux/montim.cxx
rename to core/sqf/monitor/test-legacy/montim.cxx
diff --git a/core/sqf/monitor/linux/notify.cxx b/core/sqf/monitor/test-legacy/notify.cxx
similarity index 100%
rename from core/sqf/monitor/linux/notify.cxx
rename to core/sqf/monitor/test-legacy/notify.cxx
diff --git a/core/sqf/monitor/linux/nsclient.cxx b/core/sqf/monitor/test-legacy/nsclient.cxx
similarity index 100%
rename from core/sqf/monitor/linux/nsclient.cxx
rename to core/sqf/monitor/test-legacy/nsclient.cxx
diff --git a/core/sqf/monitor/linux/nsserver.cxx b/core/sqf/monitor/test-legacy/nsserver.cxx
similarity index 100%
rename from core/sqf/monitor/linux/nsserver.cxx
rename to core/sqf/monitor/test-legacy/nsserver.cxx
diff --git a/core/sqf/monitor/linux/pingpong2.cxx b/core/sqf/monitor/test-legacy/pingpong2.cxx
similarity index 100%
rename from core/sqf/monitor/linux/pingpong2.cxx
rename to core/sqf/monitor/test-legacy/pingpong2.cxx
diff --git a/core/sqf/monitor/linux/server.cxx b/core/sqf/monitor/test-legacy/server.cxx
similarity index 100%
rename from core/sqf/monitor/linux/server.cxx
rename to core/sqf/monitor/test-legacy/server.cxx
diff --git a/core/sqf/monitor/test/spxCtrl.cxx b/core/sqf/monitor/test-legacy/spxCtrl.cxx
similarity index 98%
rename from core/sqf/monitor/test/spxCtrl.cxx
rename to core/sqf/monitor/test-legacy/spxCtrl.cxx
index 7591849..198800b 100644
--- a/core/sqf/monitor/test/spxCtrl.cxx
+++ b/core/sqf/monitor/test-legacy/spxCtrl.cxx
@@ -35,6 +35,7 @@
 
 #include <stdio.h>
 #include <stdlib.h>
+#include <unistd.h>
 
 #include "clio.h"
 #include "sqevlog/evl_sqlog_writer.h"
@@ -457,7 +458,6 @@
 
     int sendbuf;
     replyMsg_t recvbuf;
-    int rc;
     const int clientTag = 99;
     MPI_Status status;
 
@@ -465,9 +465,9 @@
     {
         // Tell the SPX process to exit
         sendbuf = CMD_END;
-        rc = XMPI_Sendrecv (&sendbuf, 1, MPI_INT, 0, clientTag,
-                           &recvbuf, 1, MPI_INT, MPI_ANY_SOURCE,
-                           MPI_ANY_TAG, spxProcess[i].comm, &status);
+        XMPI_Sendrecv (&sendbuf, 1, MPI_INT, 0, clientTag,
+                      &recvbuf, 1, MPI_INT, MPI_ANY_SOURCE,
+                      MPI_ANY_TAG, spxProcess[i].comm, &status);
     }
 
     printf("SPX Process Test:\t\t%s\n", (testSuccess) ? "PASSED" : "FAILED");
diff --git a/core/sqf/monitor/test/spxCtrl.h b/core/sqf/monitor/test-legacy/spxCtrl.h
similarity index 100%
rename from core/sqf/monitor/test/spxCtrl.h
rename to core/sqf/monitor/test-legacy/spxCtrl.h
diff --git a/core/sqf/monitor/test/spxProc.cxx b/core/sqf/monitor/test-legacy/spxProc.cxx
similarity index 100%
rename from core/sqf/monitor/test/spxProc.cxx
rename to core/sqf/monitor/test-legacy/spxProc.cxx
diff --git a/core/sqf/monitor/test/spxTest.sub b/core/sqf/monitor/test-legacy/spxTest.sub
similarity index 100%
rename from core/sqf/monitor/test/spxTest.sub
rename to core/sqf/monitor/test-legacy/spxTest.sub
diff --git a/core/sqf/monitor/linux/test.sh b/core/sqf/monitor/test-legacy/test.sh
similarity index 100%
rename from core/sqf/monitor/linux/test.sh
rename to core/sqf/monitor/test-legacy/test.sh
diff --git a/core/sqf/monitor/linux/test.sub b/core/sqf/monitor/test-legacy/test.sub
similarity index 100%
rename from core/sqf/monitor/linux/test.sub
rename to core/sqf/monitor/test-legacy/test.sub
diff --git a/core/sqf/monitor/linux/test1.sub b/core/sqf/monitor/test-legacy/test1.sub
similarity index 100%
rename from core/sqf/monitor/linux/test1.sub
rename to core/sqf/monitor/test-legacy/test1.sub
diff --git a/core/sqf/monitor/linux/test10.sub b/core/sqf/monitor/test-legacy/test10.sub
similarity index 100%
rename from core/sqf/monitor/linux/test10.sub
rename to core/sqf/monitor/test-legacy/test10.sub
diff --git a/core/sqf/monitor/linux/test10.sub.ln b/core/sqf/monitor/test-legacy/test10.sub.ln
similarity index 100%
rename from core/sqf/monitor/linux/test10.sub.ln
rename to core/sqf/monitor/test-legacy/test10.sub.ln
diff --git a/core/sqf/monitor/linux/test10a.sub.ln b/core/sqf/monitor/test-legacy/test10a.sub.ln
similarity index 100%
rename from core/sqf/monitor/linux/test10a.sub.ln
rename to core/sqf/monitor/test-legacy/test10a.sub.ln
diff --git a/core/sqf/monitor/linux/test10b.sub.ln b/core/sqf/monitor/test-legacy/test10b.sub.ln
similarity index 100%
rename from core/sqf/monitor/linux/test10b.sub.ln
rename to core/sqf/monitor/test-legacy/test10b.sub.ln
diff --git a/core/sqf/monitor/linux/test11.sub b/core/sqf/monitor/test-legacy/test11.sub
similarity index 100%
rename from core/sqf/monitor/linux/test11.sub
rename to core/sqf/monitor/test-legacy/test11.sub
diff --git a/core/sqf/monitor/linux/test11.sub.ln b/core/sqf/monitor/test-legacy/test11.sub.ln
similarity index 100%
rename from core/sqf/monitor/linux/test11.sub.ln
rename to core/sqf/monitor/test-legacy/test11.sub.ln
diff --git a/core/sqf/monitor/linux/test12.sub b/core/sqf/monitor/test-legacy/test12.sub
similarity index 100%
rename from core/sqf/monitor/linux/test12.sub
rename to core/sqf/monitor/test-legacy/test12.sub
diff --git a/core/sqf/monitor/linux/test12a.sub b/core/sqf/monitor/test-legacy/test12a.sub
similarity index 100%
rename from core/sqf/monitor/linux/test12a.sub
rename to core/sqf/monitor/test-legacy/test12a.sub
diff --git a/core/sqf/monitor/linux/test12b.sub b/core/sqf/monitor/test-legacy/test12b.sub
similarity index 100%
rename from core/sqf/monitor/linux/test12b.sub
rename to core/sqf/monitor/test-legacy/test12b.sub
diff --git a/core/sqf/monitor/linux/test2.cmd b/core/sqf/monitor/test-legacy/test2.cmd
similarity index 100%
rename from core/sqf/monitor/linux/test2.cmd
rename to core/sqf/monitor/test-legacy/test2.cmd
diff --git a/core/sqf/monitor/linux/test2.sub b/core/sqf/monitor/test-legacy/test2.sub
similarity index 100%
rename from core/sqf/monitor/linux/test2.sub
rename to core/sqf/monitor/test-legacy/test2.sub
diff --git a/core/sqf/monitor/linux/test3.cmd b/core/sqf/monitor/test-legacy/test3.cmd
similarity index 100%
rename from core/sqf/monitor/linux/test3.cmd
rename to core/sqf/monitor/test-legacy/test3.cmd
diff --git a/core/sqf/monitor/linux/test3.sub b/core/sqf/monitor/test-legacy/test3.sub
similarity index 100%
rename from core/sqf/monitor/linux/test3.sub
rename to core/sqf/monitor/test-legacy/test3.sub
diff --git a/core/sqf/monitor/linux/test3a.sub b/core/sqf/monitor/test-legacy/test3a.sub
similarity index 100%
rename from core/sqf/monitor/linux/test3a.sub
rename to core/sqf/monitor/test-legacy/test3a.sub
diff --git a/core/sqf/monitor/linux/test3b.sub b/core/sqf/monitor/test-legacy/test3b.sub
similarity index 100%
rename from core/sqf/monitor/linux/test3b.sub
rename to core/sqf/monitor/test-legacy/test3b.sub
diff --git a/core/sqf/monitor/linux/test4.cmd b/core/sqf/monitor/test-legacy/test4.cmd
similarity index 100%
rename from core/sqf/monitor/linux/test4.cmd
rename to core/sqf/monitor/test-legacy/test4.cmd
diff --git a/core/sqf/monitor/linux/test4.sub b/core/sqf/monitor/test-legacy/test4.sub
similarity index 100%
rename from core/sqf/monitor/linux/test4.sub
rename to core/sqf/monitor/test-legacy/test4.sub
diff --git a/core/sqf/monitor/linux/test5.sub b/core/sqf/monitor/test-legacy/test5.sub
similarity index 100%
rename from core/sqf/monitor/linux/test5.sub
rename to core/sqf/monitor/test-legacy/test5.sub
diff --git a/core/sqf/monitor/linux/test6.sub b/core/sqf/monitor/test-legacy/test6.sub
similarity index 100%
rename from core/sqf/monitor/linux/test6.sub
rename to core/sqf/monitor/test-legacy/test6.sub
diff --git a/core/sqf/monitor/linux/test7.sub b/core/sqf/monitor/test-legacy/test7.sub
similarity index 100%
rename from core/sqf/monitor/linux/test7.sub
rename to core/sqf/monitor/test-legacy/test7.sub
diff --git a/core/sqf/monitor/linux/test8-10.sub.sn b/core/sqf/monitor/test-legacy/test8-10.sub.sn
similarity index 100%
rename from core/sqf/monitor/linux/test8-10.sub.sn
rename to core/sqf/monitor/test-legacy/test8-10.sub.sn
diff --git a/core/sqf/monitor/linux/test8-8.sub.sn b/core/sqf/monitor/test-legacy/test8-8.sub.sn
similarity index 100%
rename from core/sqf/monitor/linux/test8-8.sub.sn
rename to core/sqf/monitor/test-legacy/test8-8.sub.sn
diff --git a/core/sqf/monitor/linux/test8.cmd b/core/sqf/monitor/test-legacy/test8.cmd
similarity index 100%
rename from core/sqf/monitor/linux/test8.cmd
rename to core/sqf/monitor/test-legacy/test8.cmd
diff --git a/core/sqf/monitor/linux/test8.sub b/core/sqf/monitor/test-legacy/test8.sub
similarity index 100%
rename from core/sqf/monitor/linux/test8.sub
rename to core/sqf/monitor/test-legacy/test8.sub
diff --git a/core/sqf/monitor/linux/test8.sub.ln b/core/sqf/monitor/test-legacy/test8.sub.ln
similarity index 100%
rename from core/sqf/monitor/linux/test8.sub.ln
rename to core/sqf/monitor/test-legacy/test8.sub.ln
diff --git a/core/sqf/monitor/linux/test8.sub.sn b/core/sqf/monitor/test-legacy/test8.sub.sn
similarity index 100%
rename from core/sqf/monitor/linux/test8.sub.sn
rename to core/sqf/monitor/test-legacy/test8.sub.sn
diff --git a/core/sqf/monitor/linux/test9.sub b/core/sqf/monitor/test-legacy/test9.sub
similarity index 100%
rename from core/sqf/monitor/linux/test9.sub
rename to core/sqf/monitor/test-legacy/test9.sub
diff --git a/core/sqf/monitor/linux/testall.cmd b/core/sqf/monitor/test-legacy/testall.cmd
similarity index 100%
rename from core/sqf/monitor/linux/testall.cmd
rename to core/sqf/monitor/test-legacy/testall.cmd
diff --git a/core/sqf/monitor/linux/testall.sh b/core/sqf/monitor/test-legacy/testall.sh
similarity index 100%
rename from core/sqf/monitor/linux/testall.sh
rename to core/sqf/monitor/test-legacy/testall.sh
diff --git a/core/sqf/monitor/linux/testall.sub b/core/sqf/monitor/test-legacy/testall.sub
similarity index 100%
rename from core/sqf/monitor/linux/testall.sub
rename to core/sqf/monitor/test-legacy/testall.sub
diff --git a/core/sqf/monitor/linux/testspx.cxx b/core/sqf/monitor/test-legacy/testspx.cxx
similarity index 100%
rename from core/sqf/monitor/linux/testspx.cxx
rename to core/sqf/monitor/test-legacy/testspx.cxx
diff --git a/core/sqf/monitor/linux/testtm.cxx b/core/sqf/monitor/test-legacy/testtm.cxx
similarity index 100%
rename from core/sqf/monitor/linux/testtm.cxx
rename to core/sqf/monitor/test-legacy/testtm.cxx
diff --git a/core/sqf/monitor/test/tmSyncCluster.sub b/core/sqf/monitor/test-legacy/tmSyncCluster.sub
similarity index 100%
rename from core/sqf/monitor/test/tmSyncCluster.sub
rename to core/sqf/monitor/test-legacy/tmSyncCluster.sub
diff --git a/core/sqf/monitor/test/tmSyncCtrl.cxx b/core/sqf/monitor/test-legacy/tmSyncCtrl.cxx
similarity index 100%
rename from core/sqf/monitor/test/tmSyncCtrl.cxx
rename to core/sqf/monitor/test-legacy/tmSyncCtrl.cxx
diff --git a/core/sqf/monitor/test/tmSyncCtrl.h b/core/sqf/monitor/test-legacy/tmSyncCtrl.h
similarity index 100%
rename from core/sqf/monitor/test/tmSyncCtrl.h
rename to core/sqf/monitor/test-legacy/tmSyncCtrl.h
diff --git a/core/sqf/monitor/test/tmSyncTest.cxx b/core/sqf/monitor/test-legacy/tmSyncTest.cxx
similarity index 100%
rename from core/sqf/monitor/test/tmSyncTest.cxx
rename to core/sqf/monitor/test-legacy/tmSyncTest.cxx
diff --git a/core/sqf/monitor/test/tmSyncVirtual.sub b/core/sqf/monitor/test-legacy/tmSyncVirtual.sub
similarity index 100%
rename from core/sqf/monitor/test/tmSyncVirtual.sub
rename to core/sqf/monitor/test-legacy/tmSyncVirtual.sub
diff --git a/core/sqf/monitor/test/Makefile b/core/sqf/monitor/test/Makefile
old mode 100755
new mode 100644
index 889468f..c4174f5
--- a/core/sqf/monitor/test/Makefile
+++ b/core/sqf/monitor/test/Makefile
@@ -27,7 +27,7 @@
 #FLAGS+= -fdiagnostics-show-option
 FLAGS+= -Wall -Wextra
 
-LIBS+= -L$(LIBEXPDIR) -L$(ZOOKEEPER_DIR)/lib -lsqlite3 -ltrafconfig
+LIBS+= -L$(LIBEXPDIR) -L$(ZOOKEEPER_DIR)/lib -lsqlite3 -ltrafconfig -lpthread
 
 INCLUDES =  -I$(INCEXPDIR) -I../linux -I../../src/trafconf
 
@@ -58,11 +58,12 @@
 TEST_PGMS	+= $(OUTDIR)/procCreate
 TEST_PGMS	+= $(OUTDIR)/dtmCtrl
 TEST_PGMS	+= $(OUTDIR)/dtmProc
-TEST_PGMS	+= $(OUTDIR)/spxCtrl
-TEST_PGMS	+= $(OUTDIR)/spxProc
+TEST_PGMS	+= $(OUTDIR)/dtm
+#TEST_PGMS	+= $(OUTDIR)/spxCtrl
+#TEST_PGMS	+= $(OUTDIR)/spxProc
 #TEST_PGMS	+= $(OUTDIR)/tmSyncTest
 #TEST_PGMS	+= $(OUTDIR)/tmSyncCtrl
-TEST_PGMS	+= $(OUTDIR)/dummy
+#TEST_PGMS	+= $(OUTDIR)/dummy
 
 # Compile rules for building tests
 $(OUTDIR)/%:%.cxx $(CLIENTOBJS)
@@ -99,14 +100,15 @@
 $(OUTDIR)/deathWatch: deathWatch.cxx $(CLIENTOBJS)
 $(OUTDIR)/persistentProc: persistentProc.cxx $(CLIENTOBJS)
 $(OUTDIR)/procCreate : procCreate.cxx $(CLIENTOBJS)
-$(OUTDIR)/dtmCtrl : spxCtrl.cxx $(CLIENTOBJS)
-$(OUTDIR)/dtmProc : spxProc.cxx $(CLIENTOBJS)
-$(OUTDIR)/spxCtrl : spxCtrl.cxx $(CLIENTOBJS)
-$(OUTDIR)/spxProc : spxProc.cxx $(CLIENTOBJS)
-$(OUTDIR)/tmSyncTest: tmSyncTest.cxx  $(CLIENTOBJS)
-$(OUTDIR)/tmSyncCtrl: tmSyncCtrl.cxx  $(CLIENTOBJS)
+$(OUTDIR)/dtmCtrl : dtmCtrl.cxx $(CLIENTOBJS)
+$(OUTDIR)/dtmProc : dtmProc.cxx $(CLIENTOBJS)
 $(OUTDIR)/montestutil.o: montestutil.cxx ../linux/msgdef.h
 
+#$(OUTDIR)/spxCtrl : spxCtrl.cxx $(CLIENTOBJS)
+#$(OUTDIR)/spxProc : spxProc.cxx $(CLIENTOBJS)
+#$(OUTDIR)/tmSyncTest: tmSyncTest.cxx  $(CLIENTOBJS)
+#$(OUTDIR)/tmSyncCtrl: tmSyncCtrl.cxx  $(CLIENTOBJS)
+
 clean:
 	@echo -rm -f $(OUTDIR)/*.o $(TEST_PGMS)
 	-rm -f $(OUTDIR)/*.o $(TEST_PGMS)
diff --git a/core/sqf/monitor/test/notes.txt b/core/sqf/monitor/test/aareadme.txt
old mode 100755
new mode 100644
similarity index 73%
rename from core/sqf/monitor/test/notes.txt
rename to core/sqf/monitor/test/aareadme.txt
index 4976288..8e73c05
--- a/core/sqf/monitor/test/notes.txt
+++ b/core/sqf/monitor/test/aareadme.txt
@@ -19,17 +19,59 @@
 #
 # @@@ END COPYRIGHT @@@
 
-setup:
-1.  Make sure sqconfig is set up correctly and sqgen run.
-    (some tests require a minimum number of nodes.  Tests should check
-     that number of nodes needed are actually available)
+##################
+Monitor unit tests
+##################
 
-2.  make sure directory containing executables is on PATH
+Scripts:
+  o runtest - driver test script for all tests
+
+    Execute 'runtest' with no run time options to display usage:
+    
+    Example:
+
+    runtest { -cluster | -virtual } [ -nogen | -trace | -test <num> ]
+
+    Where: <num> is one of the following tests:
+             1     - Child Exit
+             2     - Multi-Node
+             3     - Registry
+             4     - Death Notice
+             5     - Persistent Process
+             6     - DTM Process
+             7     - Process Create
+             8     - Node down before startup
+
+  o monpkillall   - to forcibly terminate test programs 
+  o monpstat      - test programs process status
+  o monshell      - shell wrapper supporting virtual nodes configuration
+  o montestgen    - to compile test configuration with virtual nodes
+  o montestgen.pl - supports compile of test configuration with virtual nodes
+
+Configuration files used by 'runtest':
+
+  o sqconfig.monitor.cluster  (-cluster run time option)
+    - Update the 'node section' in this file to execute tests in a 'real cluster'
+  o sqconfig.monitor.virtual  (-virtual run time option)
+    - No changes required
+  o sqconfig.persist
+    - No changes required 
+  o sqconfig.persist.dtm
+    - No changes required 
+
+Setup:
+
+1.  Build the test programs before running 'runtest'
+2.  Make sure directory containing executables is on PATH
         export PATH=$PATH:$PWD/Linux-x86_64/dbg
 
-3.  Set the following environment variable so the shell looks in the
-    current directory instead of $TRAF_HOME/sql/scripts:
-       export SQ_SHELL_NOCWD=1
+Test results:
+
+  o Each test displays a PASSED or FAILED indicating the result of the test
+  o In addition, after each test is executed, the instance is 'shutdown' and
+    a check is made to determine that the 'shutdown' stopped all
+    processes with a PASSED or FAILED indicating that the shutdown
+    was successful or not.
 
 -----------------------------------
 Child Exit
@@ -42,12 +84,7 @@
 Discussion:
 
 How to run:
-   sqshell
-      startup
-      exec shell childExit.sub
-      shutdown
-      quit
-
+   runtest { -cluster | -virtual } [-nogen] [-trace] -test 1 
 
 Files for this test:
    childExitChild.cxx
@@ -91,11 +128,7 @@
 Discussion:
 
 How to run:
-   sqshell
-      startup
-      exec shell multiNode.sub
-      shutdown
-      quit
+   runtest { -cluster | -virtual } [-nogen] [-trace] -test 2
 
 Files for this test:
    client.cxx
@@ -123,9 +156,7 @@
 Discussion:
 
 How to run:
-   sqshell
-      startup
-      exec shell regTest.sub
+   runtest { -cluster | -virtual } [-nogen] [-trace] -test 3
 
 Files for this test:
    regTestCtrl.cxx
@@ -193,9 +224,7 @@
    processes to receive a death notice for the same process.
 
 How to run:
-   sqshell
-      startup
-      exec shell deathNotice.sub
+   runtest { -cluster | -virtual } [-nogen] [-trace] -test 4
 
    If "Test PASSED" is output then the test passed.
 
@@ -234,9 +263,7 @@
    When the last shell exits, there should be no Trafodion processes running
 
 How to run:
-   sqshell
-      startup
-      exec shell persistentProc.sub
+   runtest { -cluster | -virtual } [-nogen] [-trace] -test 5
 
    If "Test PASSED" is output then the test passed.
 
@@ -253,91 +280,35 @@
    - exercise additional variations on restarting
 
 -----------------------------------
-tmSync
+DTM Process
 -----------------------------------
 
 Description:
-   Test TM sync requests with and without collisions
-
-How to run on a virtual cluster:
-   sqshell
-      startup
-      exec shell tmSyncVirtual.sub
-
-Relationship to original monitor tests:
-   Replaces test8.sub
-
-Discussion:
-   sub-test 1:
-      similar to sub-test 7 but spare node is available
-      every node that participates should commit transaction
-
-      all start transaction, 1 dies, spare node activated
-      (funky numbers depending on when node goes away)
-      total 5
-      total committed 5
-
-   sub-test 3:
-      only one tm starts 2 phase protocol for commit
-      other nodes always reply "commit"
-
-   sub-test 4:
-      node 1 starts transaction, no others do
-      all 6 abort the transaction
-
-   sub-test 5:
-      each tm starts 10 transactions
-      all commit transactions
-      total transactions = 60
-      commits = 60
-
-   sub-test 6:
-      similar to sub-test 5 but:
-         each tm starts 10 transactions
-         only 1 monitor's transaction is committed, other 5 aborted
-         commit should be 10
-         abort should be 50
-         total should be 60
-
-   sub-test 7:
-      all nodes start 1 transaction
-      1 node's transaction is comitted, others are aborted
-      one node goes down and no spare is available
-      abort 4, commit 1, total 5
-
-
-To do:
-1) need to verify on real cluster
-2) add real-cluster tests (as run originally using test8.sub.sn,
-   test8-8.sub.sn, test8-10.sub.sn) [1/6/12: waiting for fix for
-   spare node startup problem]
-
------------------------------------
-spx test
------------------------------------
-
-Description:
-   Verify monitor capabilities for SPX process type (SeaPilot Proxy Process)
+   Exercises the monitor DTM process management rules.
 
 Discussion:
    The test performs the following steps:
-   1.  Verifies that the configuration of Trafodion nodes is sufficient
-       for the test.
-   2.  Verify ability to start an SPX process on each of the physical nodes
-   3.  Verify that if an SPX process dies each of the other SPX
-       process receives a process death notification.
-   4.  Verify that can only start one SPX process on a given logical node.
-   5.  Verify that cannot start an SPX process on a logical node that
-       shares a physical node with another logical node where an SPX
-       process is running.
+     1.  Verify ability to start a DTM process on each of the logical nodes
+     2.  Verify that if a DTM process dies each of the other DTM
+         processes DOES NOT receive a process death OR tmRestarted notification.
+     3.  Verify that DTM as a persistent process when restarted
+         sends TmReady request to monitor.
+     4.  Verify that only one DTM process can be started on a logical node.
+     5.  Verify that a persistent DTM process that exceeds its restart limits
+         brings the node down.
+
+   When the last shell exits, there should be no Trafodion processes running
 
 How to run:
-   sqshell
-      startup
-      exec shell spxTest.sub
+   runtest { -cluster | -virtual } [-nogen] [-trace] -test 6
+
+   If "Test PASSED" is output then the test passed.
 
 Relationship to original monitor tests:
-   Replaces test11.sub (and includes new test capabilities)
+   None
+
+Additional test ideas:
+   - none
 
 -----------------------------------
 process creation test
@@ -361,9 +332,27 @@
 
 
 How to run:
-   sqshell
-      startup
-      exec shell procCreate.sub
+   runtest { -cluster | -virtual } [-nogen] [-trace] -test 7
+
+Relationship to original monitor tests:
+   none
+
+-----------------------------------
+Shutdown and node down before startup test
+-----------------------------------
+
+Description:
+   Verify that process cleanup occurs in monitor when process is created
+   but has not sent its 'startup' message request to the monitor when
+   the instance is shutdown or the node goes down.
+
+Discussion:
+   The test performs the following steps:
+   1. Creates processes
+
+
+How to run:
+   runtest { -cluster | -virtual } [-nogen] [-trace] -test 8
 
 Relationship to original monitor tests:
    none
@@ -381,11 +370,13 @@
    test5.sub
    test6.sub               deathNotice
    test7.sub
-   test8.sub               tmSync
+   test8.sub               
    test10.sub              persistentProc
-   test11.sub              spxCtrl
+   test11.sub              
    test12.sub
       ---                  childExit
+      ---                  DTM process
+      ---                  Node down before startup
 
 ===========================================================================
 Tracing
diff --git a/core/sqf/monitor/test/childExit.sub b/core/sqf/monitor/test/childExit.sub
old mode 100755
new mode 100644
diff --git a/core/sqf/monitor/test/childExitChild.cxx b/core/sqf/monitor/test/childExitChild.cxx
old mode 100755
new mode 100644
index adc11f9..8c63208
--- a/core/sqf/monitor/test/childExitChild.cxx
+++ b/core/sqf/monitor/test/childExitChild.cxx
@@ -27,6 +27,7 @@
 
 #include <stdio.h>
 #include <stdlib.h>
+#include <unistd.h>
 #include "clio.h"
 #include "sqevlog/evl_sqlog_writer.h"
 #include "montestutil.h"
diff --git a/core/sqf/monitor/test/childExitCtrl.cxx b/core/sqf/monitor/test/childExitCtrl.cxx
old mode 100755
new mode 100644
index c86d7a9..7334ff6
--- a/core/sqf/monitor/test/childExitCtrl.cxx
+++ b/core/sqf/monitor/test/childExitCtrl.cxx
@@ -44,6 +44,8 @@
 
 #include <stdio.h>
 #include <stdlib.h>
+#include <unistd.h>
+//#include <thread>
 #include "clio.h"
 #include "sqevlog/evl_sqlog_writer.h"
 #include "montestutil.h"
@@ -291,13 +293,9 @@
     // Kill "childExitParent"
     util.requestKill ( "$PROCA", verifier );
 
-    // Wait until all death notices received or time-out
-    for (int i=0; i<5; i++)
-    {
-        if (deathNoticeCount == procListCount) break;
-        sleep(1);
-    }
-
+    // Wait for all death notices
+    sleep(5);
+    
     // Verify that got all death notices
     for (int i=0; i<procListCount; i++)
     {
diff --git a/core/sqf/monitor/test/childExitParent.cxx b/core/sqf/monitor/test/childExitParent.cxx
old mode 100755
new mode 100644
index 702f6d0..3ffefef
--- a/core/sqf/monitor/test/childExitParent.cxx
+++ b/core/sqf/monitor/test/childExitParent.cxx
@@ -28,6 +28,7 @@
 
 #include <stdio.h>
 #include <stdlib.h>
+#include <unistd.h>
 #include "clio.h"
 #include "sqevlog/evl_sqlog_writer.h"
 #include "montestutil.h"
diff --git a/core/sqf/monitor/test/client.cxx b/core/sqf/monitor/test/client.cxx
old mode 100755
new mode 100644
index 463b28b..9a9c19c
--- a/core/sqf/monitor/test/client.cxx
+++ b/core/sqf/monitor/test/client.cxx
@@ -25,6 +25,7 @@
 
 #include <stdio.h>
 #include <stdlib.h>
+#include <unistd.h>
 
 #include "clio.h"
 #include "sqevlog/evl_sqlog_writer.h"
diff --git a/core/sqf/monitor/test/deathNotice.cxx b/core/sqf/monitor/test/deathNotice.cxx
old mode 100755
new mode 100644
index 8e13c68..65b05d5
--- a/core/sqf/monitor/test/deathNotice.cxx
+++ b/core/sqf/monitor/test/deathNotice.cxx
@@ -31,6 +31,7 @@
 
 #include <stdio.h>
 #include <stdlib.h>
+#include <unistd.h>
 
 #include "clio.h"
 #include "sqevlog/evl_sqlog_writer.h"
@@ -392,7 +393,6 @@
     int deathWatcherPid[MAX_WATCHERS];
     Verifier_t deathWatcherVerifier[MAX_WATCHERS];
     char deathWatcherName[MAX_WATCHERS][25];
-    bool deathWatcherUp[MAX_WATCHERS];
     char *serverArgs[1] = {(char *) "-t"};
     int deathWatchers = 0;
 
@@ -444,7 +444,6 @@
     for (int i = 0; i < MAX_WATCHERS; i++)
     {
         deathWatcherComm[i] = MPI_COMM_NULL;
-        deathWatcherUp[i] = false;
 
         if (!util.requestNewProcess( i  // created death watcher in different nids
                                    , ProcessType_Generic
diff --git a/core/sqf/monitor/test/deathNotice.h b/core/sqf/monitor/test/deathNotice.h
old mode 100755
new mode 100644
diff --git a/core/sqf/monitor/test/deathNotice.sub b/core/sqf/monitor/test/deathNotice.sub
old mode 100755
new mode 100644
diff --git a/core/sqf/monitor/test/deathUnreg.cxx b/core/sqf/monitor/test/deathUnreg.cxx
old mode 100755
new mode 100644
diff --git a/core/sqf/monitor/test/deathWatch.cxx b/core/sqf/monitor/test/deathWatch.cxx
old mode 100755
new mode 100644
diff --git a/core/sqf/monitor/test/dtm.cxx b/core/sqf/monitor/test/dtm.cxx
new file mode 100644
index 0000000..0008bd4
--- /dev/null
+++ b/core/sqf/monitor/test/dtm.cxx
@@ -0,0 +1,320 @@
+///////////////////////////////////////////////////////////////////////////////
+//
+// @@@ START COPYRIGHT @@@
+//
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+//
+// @@@ END COPYRIGHT @@@
+//
+///////////////////////////////////////////////////////////////////////////////
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include "clio.h"
+#include "sqevlog/evl_sqlog_writer.h"
+#include "montestutil.h"
+#include "xmpi.h"
+#include "dtmCtrl.h"
+
+MonTestUtil util;
+
+long trace_settings = 0;
+FILE *shell_locio_trace_file = NULL;
+bool tracing = false;
+
+const char *MyName;
+int gv_ms_su_nid = -1;          // Local IO nid to make compatible w/ Seabed
+SB_Verif_Type  gv_ms_su_verif = -1;
+char ga_ms_su_c_port[MPI_MAX_PORT_NAME] = {0};
+
+int deathNoticesReceived = 0;
+int tmRestartedNoticesReceived = 0;
+
+pthread_mutex_t     notice_mutex = PTHREAD_MUTEX_INITIALIZER; // guards notice_signaled
+pthread_cond_t      notice_cv = PTHREAD_COND_INITIALIZER;     // broadcast when a notice arrives
+bool                notice_signaled = false;                   // latch: notice arrived, not yet consumed
+
+bool                shutdownSent = false;
+
+
+void lock_notice()   // Acquire notice_mutex; a failure is logged but is not fatal.
+{
+    int rc = pthread_mutex_lock(&notice_mutex);
+
+    if (rc != 0)
+    {   // pthread calls return the error number directly; they do not set errno
+        printf("[%s] - Unable to lock notice mutex: %s (%d)\n",
+                     MyName, strerror(rc), rc);
+    }
+}
+
+
+void unlock_notice()   // Release notice_mutex; a failure is logged but is not fatal.
+{
+    int rc = pthread_mutex_unlock(&notice_mutex);
+
+    if (rc != 0)
+    {   // pthread calls return the error number directly; they do not set errno
+        printf("[%s] - Unable to unlock notice mutex: %s (%d)\n",
+                     MyName, strerror(rc), rc);
+    }
+}
+
+int signal_notice() // Latch the notice flag and wake all waiters; returns 0, or -1 on error.
+{
+    int rc = 0;
+
+    notice_signaled = true;                  // set before the broadcast so a waiter sees it on wake-up
+    rc = pthread_cond_broadcast(&notice_cv);
+    if ( rc != 0) 
+    {
+        errno = rc;                          // pthread returns the error number; copy it for strerror below
+        printf("[%s] - Unable to signal notice: %s (%d)\n",
+                     MyName, strerror(errno), errno);
+        rc = -1;
+    }
+
+    return( rc );
+}
+
+int wait_on_notice( void ) 
+{
+    int rc = 0;
+
+    if ( ! notice_signaled ) 
+    {
+        rc = pthread_cond_wait(&notice_cv, &notice_mutex);
+        if ( rc != 0) 
+        {
+            errno = rc;
+            printf("[%s] - Unable to signal notice: %s (%d)\n",
+                         MyName, strerror(errno), errno);
+            rc = -1;
+        }
+    }
+    notice_signaled = false;
+
+    return( rc );
+}
+
+bool wait_for_notice()   // Block until a notice has been signaled; exits the process if the wait fails.
+{
+    int rc = -1;
+    printf ("[%s] Waiting for notice.\n", MyName);
+
+    lock_notice();           // wait_on_notice() hands notice_mutex to pthread_cond_wait
+    rc = wait_on_notice();
+    if ( rc == -1 )
+    {
+        exit( 1);            // note: exits with notice_mutex still held
+    }
+    unlock_notice();
+
+    return ( rc == 0 );      // the -1 case has already exited above
+}
+
+// Callback for monitor notices (registered in main via set_cb). Handles:
+//   ProcessDeath, NodeDown, NodeUp, Shutdown; any other type is logged as unexpected
+void recv_notice_msg(struct message_def *recv_msg, int )
+{
+    if ( recv_msg->type == MsgType_ProcessDeath )
+    {
+        printf( "[%s] Process death notice received for %s (%d, %d:%d),"
+                " trans_id=%lld.%lld.%lld.%lld., aborted=%d\n"
+              , MyName 
+              , recv_msg->u.request.u.death.process_name 
+              , recv_msg->u.request.u.death.nid
+              , recv_msg->u.request.u.death.pid
+              , recv_msg->u.request.u.death.verifier
+              , recv_msg->u.request.u.death.trans_id.txid[0]
+              , recv_msg->u.request.u.death.trans_id.txid[1]
+              , recv_msg->u.request.u.death.trans_id.txid[2]
+              , recv_msg->u.request.u.death.trans_id.txid[3]
+              , recv_msg->u.request.u.death.aborted );
+        ++deathNoticesReceived;   // reported by processCommands() for CMD_GET_STATUS
+    }
+    else if ( recv_msg->type == MsgType_NodeDown )
+    {
+        printf("[%s] Node %d (%s) is DOWN.\n", MyName,
+               recv_msg->u.request.u.down.nid,
+               recv_msg->u.request.u.down.node_name);
+    }
+    else if ( recv_msg->type == MsgType_NodeUp )
+    {
+        printf("[%s] Node %d (%s) is UP.\n", MyName,
+               recv_msg->u.request.u.up.nid,
+               recv_msg->u.request.u.up.node_name);
+    }
+    else if ( recv_msg->type == MsgType_Shutdown )
+    {
+        printf("[%s] Shutdown (%d)!\n", MyName,
+               recv_msg->u.request.u.shutdown.level);
+        shutdownSent = true;      // read by main() to decide whether to wait
+    }
+    else
+    {
+        printf( "[%s] unexpected notice, type=%s\n"
+              , MyName
+              , MessageTypeString( recv_msg->type ) );
+    }
+    fflush (stdout );
+
+    // Every notice, whatever its type, wakes a thread blocked in wait_for_notice()
+    lock_notice();
+    int rc = signal_notice();
+    if ( rc == -1 )
+    {
+        exit( 1);                 // note: exits with notice_mutex still held
+    }
+    unlock_notice();
+}
+
+
+// Accept a connection from the controller process, call util.requestTmReady(),
+// then serve CMD_GET_STATUS / CMD_EXIT / CMD_END requests until CMD_END is
+// received or a receive fails. A reply message is sent back after every
+// command that returns from the switch.
+void processCommands()
+{
+    MPI_Comm ctrlComm;
+    int rc;
+    MPI_Status status;
+    int recvbuf[6];
+    bool done = false;
+    const int serverTag = 100;
+
+    if ( tracing )
+    {
+        printf( "[%s] Port: %s\n", MyName, util.getPort( ) );
+        printf( "[%s] Wait to connect.\n", MyName );
+    }
+
+    XMPI_Comm_accept( util.getPort( ), MPI_INFO_NULL, 0, MPI_COMM_SELF
+                    , &ctrlComm );
+    XMPI_Comm_set_errhandler( ctrlComm, MPI_ERRORS_RETURN );
+
+    if ( tracing )
+    {
+        printf( "[%s] Connected.\n", MyName );
+    }
+
+    if ( !util.requestTmReady( ) )
+    {
+        done = true;   // the do/while below still handles one command before leaving
+    }
+
+    fflush (stdout );
+    // Value-initialize: CMD_END sets no fields but the reply is still sent.
+    replyMsg_t replyMsg = replyMsg_t();
+
+    do
+    {
+        rc = XMPI_Recv( recvbuf, 6, MPI_INT, MPI_ANY_SOURCE, MPI_ANY_TAG
+                      , ctrlComm, &status );
+        if ( rc == MPI_SUCCESS )
+        {
+            switch ( recvbuf[0] )   // first int of the request is the command code
+            {
+            case CMD_GET_STATUS:
+                printf( "[%s] got command CMD_GET_STATUS: death count=%d, "
+                        "DTM restarted count=%d\n"
+                      , MyName
+                      , deathNoticesReceived
+                      , tmRestartedNoticesReceived );
+                replyMsg.deathNoticeCount = deathNoticesReceived;
+                replyMsg.tmRestartedNoticeCount = tmRestartedNoticesReceived;
+                break;
+            case CMD_EXIT:
+                printf( "[%s] got command CMD_EXIT.\n",MyName );
+                exit( 1 );
+                break;
+            case CMD_END:
+                printf( "[%s] got command CMD_END.\n",MyName );
+                done = true;
+                break;
+            default:   // report the unknown command, then abort
+                printf( "[%s] Received (%d:%d) UNKNOWN\n"
+                      , MyName, recvbuf[0], recvbuf[1] );
+                fflush (stdout );
+                abort();
+            }
+
+            rc = XMPI_Send( &replyMsg, (int) sizeof(replyMsg_t), MPI_CHAR, 0
+                          , serverTag, ctrlComm );
+        }
+        else
+        {  // Receive failed
+            printf( "[%s] XMPI_Recv failed, rc = (%d) %s\n"
+                  , MyName, rc, util.MPIErrMsg( rc ) );
+            done = true;
+        }
+        fflush (stdout );
+    }
+    while ( !done );
+
+    if ( tracing )
+    {
+        printf( "[%s] disconnecting.\n", MyName );
+    }
+    util.closeProcess( ctrlComm );
+}
+
+
+int main (int argc, char *argv[])
+{
+    util.processArgs( argc, argv );
+    tracing = util.getTrace( );
+    MyName = util.getProcName( );
+
+    util.InitLocalIO( );
+    assert( gp_local_mon_io );
+
+    // Set local io callback function for "notices"
+    gp_local_mon_io->set_cb( recv_notice_msg, "notice" );
+
+    util.requestStartup( );
+
+    // Get and execute commands from controller process
+    //processCommands( );
+
+    // Stay up until the Shutdown notice arrives. Other notices (process
+    // death, node up/down) also wake wait_for_notice(), so re-check the flag.
+    while ( !shutdownSent )
+    {
+        printf("[%s] Waiting for shutdown notice!\n", MyName);
+        if( !wait_for_notice() )
+        {
+            printf("[%s] Failed to receive shutdown notice! Aborting\n",MyName);
+            break;   // give up waiting and fall through to the exit sequence
+        }
+    }
+
+    // tell monitor we are exiting
+    util.requestExit( );
+
+    printf( "[%s] calling Finalize!\n", MyName );
+    fflush( stdout );
+    XMPI_Close_port( util.getPort( ) );
+    if ( gp_local_mon_io )
+    {
+        delete gp_local_mon_io;
+    }
+
+    exit( 0 );
+}
diff --git a/core/sqf/monitor/test/dtmCtrl.cxx b/core/sqf/monitor/test/dtmCtrl.cxx
index f617782..1fd4912 100644
--- a/core/sqf/monitor/test/dtmCtrl.cxx
+++ b/core/sqf/monitor/test/dtmCtrl.cxx
@@ -26,7 +26,7 @@
 // Test DTM process behavior
 //   1.  Verify ability to start an DTM process on each of the logical nodes
 //   2.  Verify that if an DTM process dies each of the other DTM
-//       process receives a process death and tmRestarted notifications.
+//       process DOES NOT receive a process death OR tmRestarted notification.
 //   3.  Verify that DTM as a persistent process when restarted
 //       sends TmReady request to monitor.
 //   4.  Verify that only one DTM process can be started on a logical node.
@@ -37,6 +37,7 @@
 #include <errno.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <unistd.h>
 
 #include "clio.h"
 #include "sqevlog/evl_sqlog_writer.h"
@@ -44,15 +45,13 @@
 #include "xmpi.h"
 #include "dtmCtrl.h"
 
-#define DTM_PROC_NAME_PREFIX        "$DTM"
+#define DTM_PROC_NAME_PREFIX        "$TM"
 #define DTM_RESTART_NID             2
 #define DTM_DOWN_NID                4
 #define DTM_KILL_DELAY              3
 #define DTM_RESTART_DELAY           3
-#define DTM_PERSIST_RETRIES         1
-#define DTM_PERSIST_DELAY          30
-#define PER_PERSIST_RETRIES         3
-#define PER_PERSIST_DELAY          30
+#define DTM_PERSIST_RETRIES         2
+#define DTM_OLD_TEST                0
 
 MonTestUtil util;
 
@@ -88,6 +87,7 @@
 int dtmProcessCount = 0;
 int persistentProcessCount = 0;
 int nidDown = -1;
+int nidUp = -1;
 
 pthread_mutex_t     notice_mutex;
 pthread_cond_t      notice_cv;
@@ -230,17 +230,6 @@
             }
         }
     }
-    else if ( recv_msg->type == MsgType_TmRestarted )
-    {
-        if ( tracing )
-            printf( "[%s] DTM Restarted in (nid=%d, pnid=%d, name=%s)\n"
-                  , MyName
-                  , recv_msg->u.request.u.tm_restart.nid
-                  , recv_msg->u.request.u.tm_restart.pnid
-                  , recv_msg->u.request.u.tm_restart.node_name );
-
-        ++dtmProcessCount;
-    }
     else if ( recv_msg->type == MsgType_NodeDown )
     {
         printf("[%s] Node %d (%s) is DOWN.\n", MyName,
@@ -249,6 +238,15 @@
         nidDown = recv_msg->u.request.u.down.nid;
         nodeDown = true;
     }
+    else if ( recv_msg->type == MsgType_NodeUp )
+    {
+        printf("[%s] Node %d (%s) is Up.\n", MyName,
+               recv_msg->u.request.u.up.nid,
+               recv_msg->u.request.u.up.node_name);
+        nidUp = recv_msg->u.request.u.up.nid;
+        nodeDown = false;
+        //++dtmProcessCount;
+    }
     else
     {
         printf( "[%s] unexpected notice, type=%s\n"
@@ -329,36 +327,11 @@
     bool testSuccess = true;
     char *childArgs[1] = {(char *) "-t"};
     char procName[MAX_PROCESS_NAME] = {0};
-    char value[25];
 
-    sprintf( procName, "$PP%03d", nid );
+    sprintf( procName, "$PP%d", nid );
     
-    // Set persistent nid
-    printf( "[%s] For process %s setting PERSIST_ZONES=%d\n"
-          , MyName, procName, nid);
-    sprintf( value, "%d", nid );
-    if (!util.requestSet( ConfigType_Process
-                        , procName
-                        , "PERSIST_ZONES"
-                        , value))
-    {
-        return false;
-    }
-
-    // Set count of times to restart and persistent "max time"
-    printf( "[%s] For process %s setting PERSIST_RETRIES=%d,%d\n"
-          ,   MyName, procName, PER_PERSIST_RETRIES, PER_PERSIST_DELAY);
-    sprintf(value, "%d,%d", PER_PERSIST_RETRIES, PER_PERSIST_DELAY);
-    if (!util.requestSet( ConfigType_Process
-                        , procName
-                        , "PERSIST_RETRIES"
-                        , value))
-    {
-        return false;
-    }
-
     if ( util.requestNewProcess( nid
-                               , ProcessType_Generic
+                               , ProcessType_PERSIST
                                , false
                                , procName
                                , "dtmProc"
@@ -586,39 +559,14 @@
     sleep( DTM_KILL_DELAY );
 }
 
-bool createDTM( int nid , const char *procNamePrefix )
+bool checkDTM( int nid , const char *procNamePrefix )
 {
     bool testSuccess = true;
     char *childArgs[1] = {(char *) "-t"};
     char procName[MAX_PROCESS_NAME] = {0};
-    char value[25];
 
-    sprintf( procName, "%s%03d", procNamePrefix, nid );
+    sprintf( procName, "%s%d", procNamePrefix, nid );
     
-    // Set persistent nid
-    printf( "[%s] For process %s setting PERSIST_ZONES=%d\n"
-          , MyName, procName, nid);
-    sprintf( value, "%d", nid );
-    if (!util.requestSet( ConfigType_Process
-                        , procName
-                        , "PERSIST_ZONES"
-                        , value))
-    {
-        return false;
-    }
-
-    // Set count of times to restart and persistent "max time"
-    printf( "[%s] For process %s setting PERSIST_RETRIES=%d,%d\n"
-          ,   MyName, procName, DTM_PERSIST_RETRIES, DTM_PERSIST_DELAY);
-    sprintf(value, "%d,%d", DTM_PERSIST_RETRIES, DTM_PERSIST_DELAY);
-    if (!util.requestSet( ConfigType_Process
-                        , procName
-                        , "PERSIST_RETRIES"
-                        , value))
-    {
-        return false;
-    }
-
     if ( util.requestNewProcess( nid
                                , ProcessType_DTM
                                , false
@@ -641,16 +589,40 @@
               , dtmProcess[nid].pid
               , dtmProcess[nid].verifier );
 
-        dtmProcess[nid].dead = false;
-
-        testSuccess = openDTM( nid );
+        testSuccess = false;
     }
     else
     {
-        dtmProcess[nid].dead = true;
         printf( "[%s] Failed to start DTM process %s on node %d\n"
-              , MyName, procName, nid );
-        testSuccess = false;
+              , MyName
+              , procName
+              , nid );
+        strcpy(dtmProcess[nid].procName,procName);
+        if( util.requestProcInfo( dtmProcess[nid].procName   
+                                , dtmProcess[nid].nid
+                                , dtmProcess[nid].pid
+                                , dtmProcess[nid].verifier ))
+        {
+            printf( "[%s] Existing DTM process with name %s on node %d with PID=%d and Verifier=%d\n"
+                  , MyName
+                  , dtmProcess[nid].procName
+                  , dtmProcess[nid].nid
+                  , dtmProcess[nid].pid
+                  , dtmProcess[nid].verifier );
+      
+            dtmProcess[nid].dead = false;
+            //testSuccess = true;
+            testSuccess = openDTM( nid );
+        }
+        else 
+        {
+            printf( "[%s] Failed to find DTM process %s on node %d\n"
+                  , MyName
+                  , procName
+                  , nid );
+            dtmProcess[nid].dead = true;
+            testSuccess = false;
+        }
     }
 
     return testSuccess;
@@ -796,10 +768,12 @@
     sleep( DTM_KILL_DELAY );
 }
 
-//   1.  Verify ability to start an DTM process on each of the logical nodes
+//   1.  Verify that a primitive DTM process cannot be created on any logical node
+//   2.  Verify that a primitive DTM process exists on each logical node
 //
-//       One DTM process is created in each logical node.
-//       One generic persistent process is created in each logical node.
+//   DTM process creation must fail in each logical node.
+//   A DTM process should already have been created by the Monitor as one of
+//   its primitive processes.
 bool DTM_test1 ()
 {
     int prevNid = -1;
@@ -834,15 +808,16 @@
 
             prevNid = reqNid = nodeData->node[i].nid;
 
-            if ( createDTM( reqNid, DTM_PROC_NAME_PREFIX ) )
+            if ( checkDTM( reqNid, DTM_PROC_NAME_PREFIX ) )
             {
                 ++dtmProcessCount;
             }
             else
             {
-                printf( "[%s] Failed to start DTM process on node %d\n"
+                printf( "[%s] Failed to find primitive DTM process on node %d\n"
                       , MyName, reqNid );
                 testSuccess = false;
+                break;
             }
 
             if ( createPersistent( reqNid ) )
@@ -852,6 +827,7 @@
             else
             {
                 testSuccess = false;
+                break;
             }
         }
     }
@@ -872,12 +848,11 @@
     return testSuccess;
 }
 
-//   2.  Verify that if an DTM process dies each of the other DTM
-//       process receives a process death and tmRestarted notifications.
+//   2.  Verify that if a DTM process dies, none of the other DTM
+//       processes receives a process death or tmRestarted notification.
 //
-//       Request the notice counts from each DTM process. All except the
-//       restarted DTM will return the notices received counts greater than
-//       zero.
+//       Request the notice counts from each DTM process. All non-restarted 
+//       DTMs will return the notices received counts equal to zero.
 bool DTM_test2 ()
 {
     bool testSuccess = true;
@@ -934,16 +909,16 @@
         }
     }
 
-    if ( dtmDeathNoticeCount != (lnodes - 1) )
+    if ( dtmDeathNoticeCount != 0 )
     {
         printf( "[%s] Got %d DTM death notifications, expecting %d\n"
-              , MyName, dtmDeathNoticeCount, (dtmProcessCount - 1) );
+              , MyName, dtmDeathNoticeCount, 0 );
         testSuccess = false;
     }
-    if ( tmRestartedNoticeCount != (lnodes - 1) )
+    if ( tmRestartedNoticeCount != 0 )
     {
         printf( "[%s] Got %d DTM restarted notifications, expecting %d\n"
-              , MyName, tmRestartedNoticeCount, (dtmProcessCount - 1) );
+              , MyName, tmRestartedNoticeCount, 0 );
         testSuccess = false;
     }
 
@@ -1067,33 +1042,33 @@
     {
         if ( tracing )
         {
-            printf( "[%s] Killing DTM process on nid=%d, retry=%d\n"
-                  , MyName, DTM_DOWN_NID, i+1 );
+            printf( "[%s] Killing DTM process %s (%d, %d:%d) retry=%d\n"
+                  , MyName
+                  , dtmProcess[DTM_DOWN_NID].procName
+                  , dtmProcess[DTM_DOWN_NID].nid
+                  , dtmProcess[DTM_DOWN_NID].pid
+                  , dtmProcess[DTM_DOWN_NID].verifier, i+1 );
         }
 
         killDTM( DTM_DOWN_NID );
         
         // Wait for the process to be recreated
-        for ( int j = 0; j < 5; j++ )
+        sleep( DTM_RESTART_DELAY );
+        if ( infoDTM( DTM_DOWN_NID ) )
         {
-            if ( infoPersistent( DTM_DOWN_NID ) )
-            {
-                printf( "[%s] Found restarted persistent process %s (%d, %d:%d)\n"
-                      , MyName
-                      , persistentProcess[i].procName
-                      , persistentProcess[i].nid
-                      , persistentProcess[i].pid
-                      , persistentProcess[i].verifier );
-                persistentProcess[i].dead = false;
-                break;
-            }
-            else
-            {
-                printf( "[%s] Failed to find persistent process on node %d\n"
-                      , MyName, i );
-                persistentProcess[i].dead = true;
-                sleep( DTM_RESTART_DELAY );
-            }
+            printf( "[%s] Found restarted persistent process %s (%d, %d:%d)\n"
+                  , MyName
+                  , dtmProcess[DTM_DOWN_NID].procName
+                  , dtmProcess[DTM_DOWN_NID].nid
+                  , dtmProcess[DTM_DOWN_NID].pid
+                  , dtmProcess[DTM_DOWN_NID].verifier );
+            dtmProcess[DTM_DOWN_NID].dead = false;
+        }
+        else
+        {
+            printf( "[%s] Failed to find persistent process on node %d\n"
+                  , MyName, DTM_DOWN_NID );
+            dtmProcess[DTM_DOWN_NID].dead = true;
         }
     }
 
@@ -1120,7 +1095,6 @@
 
 int main (int argc, char *argv[])
 {
-
     bool testSuccess = true;
 
     util.processArgs( argc, argv );
@@ -1142,43 +1116,92 @@
 
     if ( testSuccess )
     {
-        printf( "[%s] Beginning DTM sub-test 1\n", MyName );
+        printf( "[%s] BEGIN DTM sub-test 1\n", MyName );
 
         testSuccess = DTM_test1( );
+
+        printf( "[%s] END DTM sub-test 1, test: ", MyName );
+        if (testSuccess)
+        {
+            printf( "PASS\n" );
+        }
+        else
+        {
+            printf( "FAIL\n" );
+        }
     }
     fflush (stdout );
     if ( testSuccess )
     {
-        printf( "[%s] Beginning DTM sub-test 2\n", MyName );
+        printf( "[%s] BEGIN DTM sub-test 2\n", MyName );
 
         testSuccess = DTM_test2( );
+
+        printf( "[%s] END DTM sub-test 2, test: ", MyName );
+        if (testSuccess)
+        {
+            printf( "PASS\n" );
+        }
+        else
+        {
+            printf( "FAIL\n" );
+        }
     }
     fflush (stdout );
     if ( testSuccess )
     {
-        printf( "[%s] Beginning DTM sub-test 3\n", MyName );
+        printf( "[%s] BEGIN DTM sub-test 3\n", MyName );
 
         testSuccess = DTM_test3( );
+
+        printf( "[%s] END DTM sub-test 3, test: ", MyName );
+        if (testSuccess)
+        {
+            printf( "PASS\n" );
+        }
+        else
+        {
+            printf( "FAIL\n" );
+        }
     }
     fflush (stdout );
     if ( testSuccess )
     {
-        printf( "[%s] Beginning DTM sub-test 4\n", MyName );
+        printf( "[%s] BEGIN DTM sub-test 4\n", MyName );
 
         testSuccess = DTM_test4( );
+
+        printf( "[%s] END DTM sub-test 4, test: ", MyName );
+        if (testSuccess)
+        {
+            printf( "PASS\n" );
+        }
+        else
+        {
+            printf( "FAIL\n" );
+        }
     }
     fflush (stdout );
     if ( testSuccess )
     {
-        printf( "[%s] Beginning DTM sub-test 5\n", MyName );
+        printf( "[%s] BEGIN DTM sub-test 5\n", MyName );
 
         testSuccess = DTM_test5( );
+
+        printf( "[%s] END DTM sub-test 5, test: ", MyName );
+        if (testSuccess)
+        {
+            printf( "PASS\n" );
+        }
+        else
+        {
+            printf( "FAIL\n" );
+        }
     }
     fflush (stdout );
 
     int sendbuf;
     replyMsg_t recvbuf;
-    int rc;
     const int clientTag = 99;
     MPI_Status status;
 
@@ -1189,7 +1212,7 @@
             printf( "[%s] Sending CMD_END to process %s\n"
                   , MyName, dtmProcess[i].procName );
             sendbuf = CMD_END;
-            rc = XMPI_Sendrecv( &sendbuf, 1, MPI_INT, 0, clientTag,
+            XMPI_Sendrecv( &sendbuf, 1, MPI_INT, 0, clientTag,
                 &recvbuf, 1, MPI_INT, MPI_ANY_SOURCE,
                 MPI_ANY_TAG, dtmProcess[i].comm, &status );
         }
@@ -1201,19 +1224,16 @@
                   , dtmProcess[i].procName
                   , dtmProcess[i].nid
                   , dtmProcess[i].pid
-                  , dtmProcess[i].verifier 
-                  , dtmProcess[i].comm 
+                  , dtmProcess[i].verifier
+                  , dtmProcess[i].comm
                   , dtmProcess[i].dead );
-        }
-    }
-    for ( int i=0; i < lnodes; ++i )
-    {
+         }
         if ( persistentProcess[i].comm  != -1 )
         {
             printf( "[%s] Sending CMD_END to process %s\n"
                   , MyName, persistentProcess[i].procName );
             sendbuf = CMD_END;
-            rc = XMPI_Sendrecv( &sendbuf, 1, MPI_INT, 0, clientTag,
+            XMPI_Sendrecv( &sendbuf, 1, MPI_INT, 0, clientTag,
                 &recvbuf, 1, MPI_INT, MPI_ANY_SOURCE,
                 MPI_ANY_TAG, persistentProcess[i].comm, &status );
         }
@@ -1225,20 +1245,20 @@
                   , persistentProcess[i].procName
                   , persistentProcess[i].nid
                   , persistentProcess[i].pid
-                  , persistentProcess[i].verifier 
-                  , persistentProcess[i].comm 
+                  , persistentProcess[i].verifier
+                  , persistentProcess[i].comm
                   , persistentProcess[i].dead );
-        }
-    }
+         }
+     }
 
-    sleep( 5 );
-    printf( "DTM Process Test:\t\t%s\n", (testSuccess) ? "PASSED" : "FAILED" );
+     sleep( 5 );
+     printf( "DTM Process Test:\t\t%s\n", (testSuccess) ? "PASSED" : "FAILED" );
 
-    // tell monitor we are exiting
-    util.requestExit( );
+     // tell monitor we are exiting
+     util.requestExit( );
 
-    XMPI_Close_port( util.getPort( ) );
-    if ( gp_local_mon_io )
+     XMPI_Close_port( util.getPort( ) );
+     if ( gp_local_mon_io )
     {
         delete gp_local_mon_io;
     }
diff --git a/core/sqf/monitor/test/dtmProc.cxx b/core/sqf/monitor/test/dtmProc.cxx
index a830f70..5e67d73 100644
--- a/core/sqf/monitor/test/dtmProc.cxx
+++ b/core/sqf/monitor/test/dtmProc.cxx
@@ -26,6 +26,7 @@
 #include <errno.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <unistd.h>
 #include "clio.h"
 #include "sqevlog/evl_sqlog_writer.h"
 #include "montestutil.h"
@@ -149,15 +150,6 @@
               , recv_msg->u.request.u.death.aborted );
         ++deathNoticesReceived;
     }
-    else if ( recv_msg->type == MsgType_TmRestarted )
-    {
-        printf( "[%s] DTM Restarted in (nid=%d, pnid=%d, name=%s)\n"
-              , MyName
-              , recv_msg->u.request.u.tm_restart.nid
-              , recv_msg->u.request.u.tm_restart.pnid
-              , recv_msg->u.request.u.tm_restart.node_name );
-        ++tmRestartedNoticesReceived;
-    }
     else if ( recv_msg->type == MsgType_NodeDown )
     {
         printf("[%s] Node %d (%s) is DOWN.\n", MyName,
@@ -298,6 +290,8 @@
 
     util.requestStartup( );
 
+    //pause();
+
     // Get and execute commands from controller process
     processCommands( );
 
diff --git a/core/sqf/monitor/test/monpkillall b/core/sqf/monitor/test/monpkillall
new file mode 100755
index 0000000..ee155b6
--- /dev/null
+++ b/core/sqf/monitor/test/monpkillall
@@ -0,0 +1,86 @@
+#!/bin/bash
+#
+# @@@ START COPYRIGHT @@@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# @@@ END COPYRIGHT @@@
+#
+
+# Obtain process ids of any monitors for current user.
+MONPIDLIST=$(ps --no-header -C monitor -o user:12,pid | grep -w ^$USER | awk '{print $2}')
+# Make MONPIDLISTCS a comma separated list of monitor process ids
+MONPIDLISTCS=$(echo $MONPIDLIST | sed 's/\s/,/g')
+
+# Obtain process ids of any watchdog processes for current user.
+WDTPIDLIST=$(ps --sort=cmd,pid -C sqwatchdog -o user:12,pid | grep -w ^$USER | awk '{print $2}')
+
+# Obtain process ids of Trafodion processes (except monitor and watchdog processes)
+if [ "$1" = "-safekill" ]; then
+    SAFE=1
+fi
+SQ_PROCS=$(monpstat -h -s | awk '{print $2}')
+
+# Remove Trafodion processes (except monitor and watchdog processes)
+if [[ -n $SQ_PROCS ]]; then
+    echo $SQ_PROCS | xargs kill -9 2>/dev/null
+fi
+
+# Kill the tcollector processes...
+uid="$(id -u)"
+pkill -9 -u $uid -f tcollector
+pkill -9 -u $uid -f mpstat
+
+# Give kills a second to take effect before looking for orphan processes
+sleep 1
+
+# Remove any other processes (except watchdog process) whose parent
+# process is a monitor (may have orphans if new processes were added
+# to the system but not included in "monpstat" or if the monitor was
+# creating a new process just at the time "monpstat" was executing).
+if [[ -n $MONPIDLISTCS ]]; then
+    ORPHANS=$(ps --no-headers --ppid $MONPIDLISTCS -o pid,cmd | grep -v sqwatchdog | awk '{print $1}')
+fi
+if [[ -n $ORPHANS ]]; then
+    echo $ORPHANS | xargs kill -9 2>/dev/null
+fi
+
+# Remove watchdog and monitor processes if not in safe mode
+if [[ -z $SAFE ]]; then
+    if [[ -n $MONPIDLIST ]]; then
+        echo $MONPIDLIST | xargs kill -9
+    fi
+    if [[ -n $WDTPIDLIST ]]; then
+        echo $WDTPIDLIST | xargs kill -9
+    fi
+
+    # Remove Trafodion processes (except monitor and watchdog processes).
+    # This repeats the earlier steps.  Generally will be few if any Trafodion
+    # processes remaining.   Those remaining are typically persistent processes
+    # that got recreated by the monitor before it was killed.
+    SQ_PROCS=$(monpstat -h -s | awk '{print $2}')
+    if [[ -n $SQ_PROCS ]]; then
+        echo $SQ_PROCS | xargs kill -9 2>/dev/null
+    fi
+fi
+
+# kill all running vili workflows
+ps -u $USER -o pid,cmd | grep " -vili" | awk '{print $1}' | xargs -r kill -9 2>/dev/null
+
+rm -f /dev/shm/sem.rms.`id -u`.* 2>/dev/null
+rm -f /dev/shm/sem.monitor*$USER 2>/dev/null
diff --git a/core/sqf/monitor/test/monpstat b/core/sqf/monitor/test/monpstat
new file mode 100755
index 0000000..a6ee5c2
--- /dev/null
+++ b/core/sqf/monitor/test/monpstat
@@ -0,0 +1,137 @@
+#!/bin/bash
+# @@@ START COPYRIGHT @@@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# @@@ END COPYRIGHT @@@
+#
+# Produce process status for Trafodion processes.
+# The output goes to stdout and may be displayed to the user or used
+# by other scripts that need to obtain a list of all Trafodion
+# processes.
+#
+# The list of Trafodion programs in variable PROGS below must include
+# all Trafodion programs.   Addition of a new Trafodion program requires
+# a corresponding addition to PROGS.
+#
+
+# Distinguished program commands for benefit of pkillall
+SAFEPROGS=monitor,sqwatchdog,memcheck-amd64-linux
+
+# Trafodion program and script command names.
+# Add new Trafodion programs and scripts to this list.
+# Observe the following:
+# 1. KEEP THE LIST IN ALPHABETICAL ORDER: A-Z then a-z
+# 2. Separate each entry with a comma.
+# 3. NO SPACES WHATSOEVER; this list is an argument to ps -C.
+#PROGS=dtmProc\
+PROGS=dtm\
+,childExitCtrl\
+,childExitParent\
+,childExitChild\
+,client\
+,server\
+,regTestCtrl\
+,deathNotice\
+,deathUnreg\
+,deathWatch\
+,persistentProc\
+,procCreate\
+,dtmCtrl\
+,dtmProc\
+,spxCtrl\
+,spxProc\
+,tmSyncTest\
+,tmSyncCtrl\
+,mpirun\
+,shell\
+,sqcheck\
+,sqregck\
+,sqstart\
+,trafns\
+,pstartd
+
+function usage {
+
+typeset MYNAME="$(basename $0)"
+cat <<EOF
+
+usage:  $MYNAME [-h] [-l] [-s] [-H]
+
+-h      Suppress header
+
+-l      List Trafodion programs.  When -s is also used,
+        list only programs used in safe mode.
+
+-s      Safe mode, does not show distinguished programs:
+                $SAFEPROGS
+
+-H      Show help text
+
+EOF
+}
+
+SUPPRESS_HEADER=0
+LIST=0
+while getopts ":hlsH" Option
+do
+  case $Option in
+    h) SUPPRESS_HEADER=1 ;;
+    l) LIST=1 ;;
+    s) SAFE=1 ;;
+    H) usage
+       exit 0;;
+  esac
+done
+shift $(($OPTIND - 1))
+
+if [[ -z $SAFE ]]; then
+  PROGS="$PROGS,$SAFEPROGS"
+fi
+
+if (( LIST == 1 )); then
+    echo "$PROGS"
+  exit 0
+fi
+
+# Print the column header unless -h was given.
+if (( SUPPRESS_HEADER == 0 )); then
+  echo "uid          pid   ppid  wchan   rss   vsz   time     stat cmd"
+  echo "---          ---   ----  -----   ---   ---   ----     ---- ---"
+fi
+
+# RES becomes 0 as soon as one of the searches below prints a process.
+PS_OUTPUT_FORMAT="user:12,pid,ppid,wchan,rss,vsz,time,stat,cmd"
+ps ww --sort=cmd,pid -C "$PROGS" -o "${PS_OUTPUT_FORMAT}" | grep -w "^$USER" | grep -v '<defunct>$'
+RES=$?
+
+# Look for tcollector processes.  "xargs -r" skips ps when pgrep finds no
+# python process; a bare ps would otherwise list unrelated processes.
+uid="$(id -u)"
+pgrep -u "${uid}" -f python | xargs -r ps --sort=cmd,pid -o "${PS_OUTPUT_FORMAT}" | grep tcollector | grep -v grep | grep -v '<defunct>$'
+(( $? == 0 )) && RES=0
+
+#Add additional Java classes to search for in the following env variable (separated by a '|' character)
+JAVA_CLASSES='(net.opentsdb.tools.TSDMain|org.trafodion.sql.LittleJetty)'
+ps ww --sort=cmd,pid -C java -o "${PS_OUTPUT_FORMAT}" | grep -w "^$USER" | grep -v '<defunct>$' | grep -E "${JAVA_CLASSES}" | cut -b1-200
+# Test the grep -E stage: $? would be the status of cut, which is always 0.
+if ((${PIPESTATUS[3]} == 0)) || ((RES == 0)); then
+  exit 0
+else
+  exit 1
+fi
diff --git a/core/sqf/monitor/test/monshell b/core/sqf/monitor/test/monshell
new file mode 100755
index 0000000..b9872bf
--- /dev/null
+++ b/core/sqf/monitor/test/monshell
@@ -0,0 +1,29 @@
+#!/bin/bash
+#
+# @@@ START COPYRIGHT @@@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# @@@ END COPYRIGHT @@@
+#
+# Source the optional per-instance settings, then run the monitor shell.
+if [ -f "$TRAF_VAR/monshell.env" ]; then
+    . "$TRAF_VAR/monshell.env"
+fi
+# "$@" forwards every argument intact (any count, embedded whitespace kept).
+shell "$@"
diff --git a/core/sqf/monitor/test/montestgen b/core/sqf/monitor/test/montestgen
new file mode 100755
index 0000000..a570fdb
--- /dev/null
+++ b/core/sqf/monitor/test/montestgen
@@ -0,0 +1,367 @@
+#!/bin/bash
+
+# @@@ START COPYRIGHT @@@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# @@@ END COPYRIGHT @@@
+#
+# montestgen script - generates various files
+
+#SQCONFIG_FILE=$TRAF_CONF/sqconfig
+#SQCONFIG_PERSIST_FILE=$TRAF_CONF/sqconfig.persist
+
+SQCONFIG_FILE=$TRAF_HOME/monitor/test/sqconfig.monitor.virtual
+SQCONFIG_PERSIST_FILE=$TRAF_HOME/monitor/test/sqconfig.persist
+
+function Usage {
+    # Show what the script does and how to call it, then end the script
+    # with exit status 1.
+    script_name=$(basename $0)
+    echo
+    echo $script_name generates various Trafodion files in the $TRAF_VAR directory.
+    printf '\n%s\n' "Usage: $script_name [ -? | -h ] [<sqconfig_filename> <sqconfig_persist_filename>]"
+    printf '%s\n' "  -?    Help" "  -h    Help"
+    printf '%s\n' "  <sqconfig_filename>         Name of the SQ config file (defaults to $SQCONFIG_FILE)"
+    printf '%s\n' "  <sqconfig_persist_filename> Name of the SQ config persist file (defaults to $SQCONFIG_PERSIST_FILE)"
+    echo
+    exit 1
+}
+
+
+
+function GetSQcnfg {
+   # Collect the distinct node-name values of $SQCONFIG_FILE (text up to the first ';')
+   TempList=`grep -o 'node-name=.[A-Za-z0-9\.\-]*' $SQCONFIG_FILE | cut -d "=" -f 2 | cut -d ";" -f 1 | sort -u`
+   # Store every node except the local host (short names compared) in SQNodeNames
+   i=0
+   for NODE in $TempList
+     do
+       if [[ ${NODE%%.*} == "$(hostname -s)" ]]; then
+         continue
+       fi
+       SQNodeNames[$i]=$NODE
+       ((i=i+1))
+
+     done
+   # Turn the collected names into a "-w <node> -w <node> ..." argument string
+   # NOTE(review): the string always starts with "-w ", so the -n test below can never fail
+   ExNodeList="-w $(echo ${SQNodeNames[@]} | sed -e 's/ / -w '/g)"
+
+   if [[ -n "$ExNodeList" ]]; then   
+      echo "$ExNodeList"
+   else
+      echo
+      echo "Could not parse $SQCONFIG_FILE file."
+      echo "Please ensure sqenv.sh has been sourced and the sqconfig file is valid.  Then, re-run sqgen."
+      echo
+      exit 1;
+   fi
+}
+
+###########################################################
+# MAIN portion of montestgen begins
+###########################################################
+
+FT_FLAG=1
+PERF_FLAG=0
+SQCONFIG_DB_FILE=$TRAF_VAR/sqconfig.db
+
+if [ -z $TRAF_HOME ]; then
+    echo
+    echo "The TRAF_HOME environment variable does not exist."
+    echo "Please ensure sqenv.sh has been sourced."
+    echo
+    exit 1;
+fi
+
+# Check whether the SQ environment is already up.
+sqcheck -i 1 -d 1 > /dev/null 2>&1
+sq_stat=$?
+if [[ $sq_stat == 0 ]]; then
+   echo "SQ environment is up. sqgen not executed. Exiting..."
+   exit 1
+elif [[ $sq_stat == 1 ]]; then
+   echo "SQ environment is partially up. sqgen not executed. Exiting..."
+   exit 1
+elif [[ $sq_stat == 2 ]]; then
+   echo "SQ environment is partially up. sqgen not executed. Exiting..."
+   exit 1
+fi
+
+# Make sure sqgen uses the latest environment
+cd $TRAF_HOME
+if [ "$SQ_BUILD_TYPE" == "release" -a -f sqenvr.sh ]; then
+  . ./sqenvr.sh
+else
+  . ./sqenv.sh
+fi
+
+if [ -n "$CHANGED_SQ_ENV_RESTART_SHELL" ]; then
+  # This is set by sqenvcom.sh when the environment has changed
+  # in the shell. The CLASSPATH and other variables are not reliable
+  # in such a situation and we should not do an sqgen.
+  echo "A change in environment variables occurred."
+  echo "Please retry sqgen in a new shell. Exiting..."
+  exit 1
+fi
+
+cd $TRAF_HOME/monitor/test
+
+if [[ ! -h $TRAF_HOME/monitor/test/sqconfigdb.pm ]]; then
+    ln -s $TRAF_HOME/sql/scripts/sqconfigdb.pm sqconfigdb.pm
+fi
+if [[ ! -h $TRAF_HOME/monitor/test/sqnameserver.pm ]]; then
+    ln -s $TRAF_HOME/sql/scripts/sqnameserver.pm sqnameserver.pm
+fi
+if [[ ! -h $TRAF_HOME/monitor/test/sqnodes.pm ]]; then
+    ln -s $TRAF_HOME/sql/scripts/sqnodes.pm sqnodes.pm
+fi
+if [[ ! -h $TRAF_HOME/monitor/test/sqpersist.pm ]]; then
+    ln -s $TRAF_HOME/sql/scripts/sqpersist.pm sqpersist.pm
+fi
+if [[ ! -h $TRAF_HOME/monitor/test/sqpersist.pm ]]; then
+    ln -s $TRAF_HOME/sql/scripts/sqpersist.pm sqpersist.pm
+fi
+
+# Check to make sure this is a real cluster configuration
+if  [[ -n "$node_count" ]] && [[ "$node_count" -gt "1" ]]  ; then
+    #
+    # NOTE: Temporarily bypass the following check and always recreate the
+    #       'sqconfig.db' file on the local node when invoked from an
+    #       agent like (CM).
+    #
+    #       Re-enable once the Trafodion Configuration database supports a 
+    #       non-SQLite storage method as well as support for the node add function.
+    #
+    if [[ -z ${TRAF_AGENT} ]]; then
+        if  [[ -n "$node_count" ]] && [[ "$node_count" -gt "1" ]]; then
+            # The configuration database can only be created the first time.
+            # Since nodes can be added and deleted through the 'shell node add/delete'
+            # commands, the 'sqconfig' node section may not reflect the current node
+            # configured. The 'sqregen' script can be used to update the SQCONFIG_FILE
+            # node section to reflect the current node configuration stored in
+            # the SQCONFIG_DB_FILE.
+            echo "Checking for the configuration file ($SQCONFIG_DB_FILE)."
+            if [ -f $SQCONFIG_DB_FILE ]; then
+                echo
+                echo "The configuration file ($SQCONFIG_DB_FILE) exists."
+                echo "Use the 'sqshell node add/delete' commands to change the node membership in the configuration database."
+                echo "Use the 'sqshell persist add/delete' commands to change the persist object configuration in the configuration database."
+                echo "Use the 'sqregen -node' script to update the $SQCONFIG_FILE 'node' section with the"
+                echo "current node membership in the configuration database."
+                echo "Use the 'sqregen -persist' script to update the $SQCONFIG_FILE 'persist' section with the"
+                echo "current persist configuration stored in the configuration database."
+                exit 1
+            fi
+        fi
+    else
+        # Temporary removal of configuration database file
+        rm -f $SQCONFIG_DB_FILE
+    fi
+
+    GetSQcnfg
+else
+    #echo "node_count=${node_Count}"
+    #echo
+    echo "Workstation environment - Not a clustered environment"
+    if [ -f $SQCONFIG_DB_FILE ]; then
+        echo
+        echo "The configuration file ($SQCONFIG_DB_FILE) was previously created."
+        echo "Removing and re-creating $SQCONFIG_DB_FILE!"
+        rm -f $SQCONFIG_DB_FILE
+    fi
+fi
+
+# Assume option is SQCONFIG_FILE
+while [ $# != 0 ]
+  do
+    flag="$1"
+    case "$flag" in
+        -h)  Usage ;;
+        -?)  Usage ;;
+        *)   SQCONFIG_FILE=$1
+             echo "SQCONFIG_FILE=$SQCONFIG_FILE"
+             shift
+             SQCONFIG_PERSIST_FILE=$1
+             echo "SQCONFIG_PERSIST_FILE=$SQCONFIG_PERSIST_FILE"
+             ;;
+    esac
+    shift
+  done
+
+if [[ ! -f $SQCONFIG_FILE ]]
+then
+  echo "Non existent sqconfig file: $SQCONFIG_FILE"
+  echo "Using default sqconfig file"
+  cp -f $TRAF_HOME/sql/scripts/sqconfig $SQCONFIG_FILE
+fi
+if [[ ! -f $SQCONFIG_PERSIST_FILE ]]
+then
+  echo "Non existent sqconfig persist file: $SQCONFIG_PERSIST_FILE"
+  echo "Using default sqconfig persist file"
+  cp -f $TRAF_HOME/sql/scripts/sqconfig.persist $SQCONFIG_PERSIST_FILE
+fi
+
+export SQLOG_DIR=$TRAF_LOG
+mkdir -p $TRAF_VAR
+mkdir -p $SQLOG_DIR
+mkdir -p $MPI_TMPDIR
+mkdir -p $MPI_TMPDIR/tmp
+# mkdir a dir for CBF data used by SQL IUS feature
+mkdir -p $HOME/cbfs
+
+# Clean HBase classpath cache file
+#echo "Clean up HBase classpath cache file: $TRAF_VAR/hbase_classpath"
+rm -rf $TRAF_VAR/hbase_classpath
+
+# Bypass if in agent mode
+if [[ -z ${TRAF_AGENT} ]]; then
+    if  [[ -n "$node_count" ]] && [[ "$node_count" -gt "1" ]]; then    
+        echo
+        echo "Creating directories on cluster nodes"
+    
+        # Clean HBase classpath cache file on all nodes
+        edb_pdsh $ExNodeList rm -rf $TRAF_VAR/hbase_classpath
+
+        echo "edb_pdsh $ExNodeList mkdir -p $TRAF_VAR "
+        edb_pdsh $ExNodeList mkdir -p $TRAF_VAR
+        
+        echo "edb_pdsh $ExNodeList mkdir -p $SQLOG_DIR "
+        edb_pdsh $ExNodeList mkdir -p $SQLOG_DIR
+    
+        echo "edb_pdsh $ExNodeList mkdir -p $MPI_TMPDIR "
+        edb_pdsh $ExNodeList mkdir -p $MPI_TMPDIR
+    
+        echo "edb_pdsh $ExNodeList mkdir -p $MPI_TMPDIR/tmp "
+        edb_pdsh $ExNodeList mkdir -p $MPI_TMPDIR/tmp
+    
+        echo "edb_pdsh $ExNodeList mkdir -p $PWD "
+        edb_pdsh $ExNodeList mkdir -p $PWD
+    
+    fi
+fi
+
+SQSCRIPT_FILE=./gomon
+SQESPENV_FILE=$TRAF_VAR/tdm_arkesp.env
+
+#echo
+if [ -f $TRAF_VAR/ms.env ]; then
+    echo "The SQ environment variable file $TRAF_VAR/ms.env exists."
+    echo "The file will not be re-generated."
+    echo
+else
+    echo "Generating environment variable file: $TRAF_VAR/ms.env"
+    echo
+    ./genms > $TRAF_VAR/ms.env
+    lv_retcode=$?
+    if [[ $lv_retcode != 0 ]]; then 
+        echo "Error $lv_retcode while executing genms. Exiting..."
+        exit $lv_retcode
+    fi
+fi
+
+#//./gensqstatem2lenv > $TRAF_VAR/sqstatem2l.env
+
+# Create configuration database tables
+sqlite3 -init $TRAF_HOME/sql/scripts/createConfigDb $SQCONFIG_DB_FILE <<eof 
+.quit
+eof
+
+echo ./montestgen.pl $SQSCRIPT_FILE `hostname` $FT_FLAG $PERF_FLAG $SQCONFIG_FILE $SQCONFIG_PERSIST_FILE
+./montestgen.pl $SQSCRIPT_FILE `hostname` $FT_FLAG $PERF_FLAG $SQCONFIG_FILE $SQCONFIG_PERSIST_FILE
+sq_stat=$?
+if [[ $sq_stat != 0 ]]; then 
+    exit $sq_stat;
+fi
+
+sq_seamonster=$SQ_SEAMONSTER
+if [ -f $SQESPENV_FILE ]; then
+    if [[ $sq_seamonster == 1 ]]; then 
+        echo
+        echo "Enabling tdm_arkesp.env file in $TRAF_VAR/ms.env"
+        echo "cat $TRAF_VAR/ms.env | sed -e "s@^# SQ_PROPS_TDM_ARKESP=tdm_arkesp.env@SQ_PROPS_TDM_ARKESP=tdm_arkesp.env@" > $TRAF_VAR/ms.env.TEMP"
+        if [ -f $TRAF_VAR/ms.env.TEMP ]; then
+            rm $TRAF_VAR/ms.env.TEMP
+        fi
+        cat $TRAF_VAR/ms.env | sed -e "s@^# SQ_PROPS_TDM_ARKESP=tdm_arkesp.env@SQ_PROPS_TDM_ARKESP=tdm_arkesp.env@" > $TRAF_VAR/ms.env.TEMP
+        cp $TRAF_VAR/ms.env.TEMP $TRAF_VAR/ms.env
+        rm $TRAF_VAR/ms.env.TEMP
+    else
+        echo
+        echo "Disabling tdm_arkesp.env file in $TRAF_VAR/ms.env"
+        echo "cat $TRAF_VAR/ms.env | sed -e "s@^SQ_PROPS_TDM_ARKESP=tdm_arkesp.env@# SQ_PROPS_TDM_ARKESP=tdm_arkesp.env@" > $TRAF_VAR/ms.env.TEMP"
+        if [ -f $TRAF_VAR/ms.env.TEMP ]; then
+            rm $TRAF_VAR/ms.env.TEMP
+        fi
+        cat $TRAF_VAR/ms.env | sed -e "s@^SQ_PROPS_TDM_ARKESP=tdm_arkesp.env@# SQ_PROPS_TDM_ARKESP=tdm_arkesp.env@" > $TRAF_VAR/ms.env.TEMP
+        cp $TRAF_VAR/ms.env.TEMP $TRAF_VAR/ms.env
+        rm $TRAF_VAR/ms.env.TEMP
+    fi
+fi
+
+# Bypass if in agent mode
+if [[ -z ${TRAF_AGENT} ]]; then
+    if  [[ -n "$node_count" ]] && [[ "$node_count" -gt "1" ]]; then    
+        echo
+        echo
+        echo "Copying the generated files to all the nodes in the cluster"
+        echo
+        echo "Copying $TRAF_VAR/ms.env to $TRAF_VAR of all the nodes"
+        echo "$PDCP $ExNodeList $TRAF_VAR/ms.env   $TRAF_VAR "
+        $PDCP $ExNodeList $TRAF_VAR/ms.env   $TRAF_VAR
+
+
+        echo
+        echo "Copying $TRAF_VAR/seamonster.env to $TRAF_VAR of all the nodes"
+        echo "$PDCP $ExNodeList $TRAF_VAR/seamonster.env   $TRAF_VAR "
+        $PDCP $ExNodeList $TRAF_VAR/seamonster.env   $TRAF_VAR
+
+        if [[ $sq_seamonster == 1 ]]; then 
+            if [ -f $SQESPENV_FILE ]; then
+                echo
+                echo "Copying $SQESPENV_FILE to $TRAF_VAR of all the nodes"
+                echo "$PDCP $ExNodeList $SQESPENV_FILE   $TRAF_VAR "
+                $PDCP $ExNodeList $SQESPENV_FILE   $TRAF_VAR
+            fi
+        fi
+
+        echo
+        echo "Copying Trafodion Configuration files to $PWD"
+
+        echo "$PDCP $ExNodeList $SQCONFIG_FILE $SQCONFIG_PERSIST_FILE $SQCONFIG_DB_FILE $PWD "
+        $PDCP $ExNodeList $SQCONFIG_FILE $SQCONFIG_PERSIST_FILE $SQCONFIG_DB_FILE $PWD
+
+        echo
+    fi
+fi
+
+# mkdir a dir for CBF data used by SQL IUS feature
+mkdir -p $HOME/cbfs
+
+echo
+echo "******* Generate public/private certificates *******"
+echo
+if (test -f $TRAF_HOME/sql/scripts/sqcertgen); then
+  $TRAF_HOME/sql/scripts/sqcertgen 2>/dev/null
+else
+  echo
+  echo "ERROR: Certificate generation script (sqcertgen) does not exist in $TRAF_HOME/sql/scripts folder"
+  echo
+  exit 1
+fi
diff --git a/core/sqf/monitor/test/montestgen.pl b/core/sqf/monitor/test/montestgen.pl
new file mode 100755
index 0000000..f4eb6ef
--- /dev/null
+++ b/core/sqf/monitor/test/montestgen.pl
@@ -0,0 +1,776 @@
+#!/usr/bin/perl
+#
+# @@@ START COPYRIGHT @@@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# @@@ END COPYRIGHT @@@
+#
+use sqconfigdb;
+use sqnameserver;
+use sqnodes;
+use sqpersist;
+use POSIX;
+
+# Process types.  Must match values defined by the monitor in msgdef.h.
+my $ProcessType_Undefined = 0;
+my $ProcessType_DTM       = 2;
+my $ProcessType_Generic   = 4;
+my $ProcessType_SSMP      = 11;
+
+my $gDebug = 0;
+
+my $bVirtualNodes=0;
+
+$gRoleEnumStorage     = "storage";
+$gRoleEnumEdge        = "connection";
+$gRoleEnumAggregation = "aggregation";
+
+my @g_ssdOverflow = ();
+my @g_hddOverflow = ();
+
+my $gdNumNodes=0;
+my $gdZoneId=0;
+
+my @gNodeIdToZoneIdIndex = ();
+
+my @g_zonelist = ();
+
+my $gdNumCpuCores = 1;
+
+my $g_CCFormat = 2;
+
+my $gbInitialLinesPrinted = 0;
+my $gbOverflowLinesPrinted = 0;
+
+my $gShellStarted=0;
+
+my $gFloatingExternalIp = "";
+my $gFloatingNodeId = -1;
+my $gFloatingFailoverNodeId = -1;
+
+
+my $TRAF_HOME = $ENV{'TRAF_HOME'};
+my $HOME = $ENV{'HOME'};
+my $MPI_TMPDIR = $ENV{'MPI_TMPDIR'};
+my $SQ_SEAMONSTER = $ENV{'SQ_SEAMONSTER'};
+my $SQ_TRANS_SOCK = $ENV{'SQ_TRANS_SOCK'};
+my $SQ_DTM_PERSISTENT_PROCESS = $ENV{'SQ_DTM_PERSISTENT_PROCESS'};
+my $SQ_IDTMSRV = $ENV{'SQ_IDTMSRV'};
+my $SQ_SRVMON = $ENV{'SQ_SRVMON'};
+my $SQ_TNOTIFY = $ENV{'SQ_TNOTIFY'};
+my $TM_ENABLE_MONARCH = $ENV{'TM_ENABLE_MONARCH'};
+
+# define the error values that are being returned
+my $BDR_ERROR = 70;
+
+
+my $g_insDbUniqStrStmt = 0;
+
+sub printScript {
+    # printScript($dWhich, @text): write @text to SQS when $dWhich <= 1 and to
+    # SQW when $dWhich >= 1 (so 1 selects both filehandles).
+    my ($dWhich, @rest) = @_;   # lexical: no package globals are overwritten
+    if ($dWhich <= 1) {
+        print SQS @rest;
+    }
+    if ($dWhich >= 1) {
+        print SQW @rest;
+    }
+}
+
+sub printIDTMScript {
+    my ($dWhich, @rest) = @_;   # first argument is accepted but ignored; the rest goes to IDTM
+    print IDTM @rest;
+}
+
+sub getTime {
+    return strftime("%a %b %d %H:%M:%S %Y\n", localtime(time));
+}
+
+sub printTime {
+    printScript(1, "# Trafodion Startup script generated @ ",getTime(),"\n");
+}
+
+sub printInitialLines {
+
+    # So we don't re-print the initial lines
+    if ($gbInitialLinesPrinted) {
+        return;
+    }
+
+    printScript(1, "#!/bin/bash \n");
+    printTime;
+
+#    $smenv = "$ENV{'TRAF_VAR'}/seamonster.env";
+#    if ( -e $smenv ) {
+#      print "\nThe SeaMonster environment variable file $smenv exists.\n";
+#      print "The file will not be re-generated.\n\n";
+#    } else {
+#      print "\nGenerating SeaMonster environment variable file: $smenv\n\n";
+#      #Create SeaMonster environment variable file
+#      open (ETC,">>$smenv")
+#          or die("unable to open $smenv");
+#      if ($bVirtualNodes == 1) {
+#          print ETC "SM_VIRTUALNODE=1\n";
+#      }
+#      if (!$ENV{'SHARED_HARDWARE'} || $ENV{SHARED_HARDWARE} eq 'YES') {
+#          print ETC "SM_PIPEDEPTH=6\n";
+#          print ETC "SM_LOWATER=3\n";
+#          print ETC "SM_BUFFSIZE=102400\n";
+#          print ETC "SM_THRESHOLD_NBYTES=51200\n";
+#      }
+#      close(ETC);
+#    }
+
+    $msenv = "$ENV{'TRAF_VAR'}/ms.env";
+
+    open (ETC,">>$msenv")
+        or die("unable to open $msenv");
+
+    if ($SQ_TRANS_SOCK == 1) {
+        print ETC "SQ_TRANS_SOCK=1\n";
+    }
+    else {
+        print ETC "SQ_TRANS_SOCK=0\n";
+    }
+
+    if ($TM_ENABLE_MONARCH == 1) {
+        print ETC "TM_ENABLE_MONARCH=1\n";
+    }
+    else {
+        print ETC "TM_ENABLE_MONARCH=0\n";
+    }
+
+    if ($bVirtualNodes == 1) {
+        $virtualnode_string = "SQ_VIRTUAL_NODES=$gdNumNodes\n";
+        $virtualnid_string = "SQ_VIRTUAL_NID=0\n";
+        printScript(1, "export $virtualnode_string");
+        printScript(1, "export $virtualnid_string");
+
+        print ETC "$virtualnode_string";
+        print ETC "$virtualnid_string";
+           # Allow specific mirroring ON override for virtual node
+        print ETC "MS_STREAMS_MIN=20000\n";
+        print ETC "MS_STREAMS_MAX=20000\n";
+    }
+    # Cluster
+    else {
+        print ETC "MS_STREAMS_MIN=20000\n";
+        print ETC "MS_STREAMS_MAX=20000\n";
+        $hugePages=`cat /proc/sys/vm/nr_hugepages`;
+        if ($hugePages != 0) {
+           if ($ENV{SHARED_HARDWARE} eq 'YES') {
+              print ETC "SQ_RMS_ENABLE_HUGEPAGES=0\n"; }
+           else {
+              print ETC "SQ_RMS_ENABLE_HUGEPAGES=1\n"; }
+        }
+
+        else {
+            print ETC "SQ_RMS_ENABLE_HUGEPAGES=0\n";
+        }
+    }
+
+    print ETC "CLASSPATH=$ENV{'CLASSPATH'}:\n";
+    close (ETC);
+
+#    printScript(1, "\nshell <<eof \n");
+#    $gShellStarted=1;
+
+#    printScript(1, "\n");
+
+#    printScript(1, "! Start the monitor processes across the cluster\n");
+
+#    printScript(1, "startup\n");
+
+#    genSQShellExit();
+
+#    printScript(1, "\nsqcheckmon\n");
+#    printScript(1, "let lv_checkmon_ret=\$\?\n");
+#    printScript(1, "if [ \$lv_checkmon_ret '==' 0 ]; then\n");
+#    printScript(1, "   echo \"Continuing with the Startup...\"\n");
+#    printScript(1, "   echo\n");
+#    printScript(1, "else\n");
+#    printScript(1, "   echo \"Aborting startup.\"\n");
+#    printScript(1, "   more \$TRAF_LOG/sqcheckmon.log\n");
+#    printScript(1, "   exit 1\n");
+#    printScript(1, "fi\n");
+
+
+#    genSQShellStart();
+
+#    if ($bVirtualNodes == 0) {
+#        printScript(1, "\nset CLUSTERNAME=\$CLUSTERNAME\n");
+#    }
+#    printScript(1, "\nset SQ_MBTYPE=$ENV{'SQ_MBTYPE'}\n");
+#    printScript(1, "\nset JAVA_HOME=\$JAVA_HOME\n");
+#    printScript(1, "\nset TRAF_CLUSTER_ID=\$TRAF_CLUSTER_ID\n");
+#    printScript(1, "\nset TRAF_INSTANCE_ID=\$TRAF_INSTANCE_ID\n");
+#    printScript(1, "\nset TRAF_FOUNDATION_READY=0\n");
+
+#    sqconfigdb::addDbClusterData( "SQ_MBTYPE", $ENV{'SQ_MBTYPE'});
+#    sqconfigdb::addDbClusterData( "TRAF_HOME", "$TRAF_HOME"); # comes out null
+#    sqconfigdb::addDbClusterData( "JAVA_HOME", "$JAVA_HOME"); 
+#    sqconfigdb::addDbClusterData( "TRAF_CLUSTER_ID", "$TRAF_CLUSTER_ID");
+#    sqconfigdb::addDbClusterData( "TRAF_INSTANCE_ID", "$TRAF_INSTANCE_ID");
+#    sqconfigdb::addDbClusterData( "TRAF_FOUNDATION_READY", "0"); 
+
+#    genSQShellExit();
+
+    $gbInitialLinesPrinted = 1;
+}
+
+sub printOverflowLines {
+    if($gbOverflowLinesPrinted) {
+        return;
+    }
+
+    $msenv = "$ENV{'TRAF_VAR'}/ms.env";
+
+    open (ETC,">>$msenv")
+        or die("unable to open $msenv");
+
+    if(@g_ssdOverflow) {
+        $ssdDir = join(':',@g_ssdOverflow);
+        print ETC "STFS_SSD_LOCATION=$ssdDir\n";
+    }
+
+    if(@g_hddOverflow) {
+        $hddDir = join(':',@g_hddOverflow);
+        print ETC "STFS_HDD_LOCATION=$hddDir\n";
+    }
+    close(ETC);
+
+    $gbOverflowLinesPrinted = 1;
+}
+
+sub printScriptEndLines {
+
+    printScript(1, "\n");
+    printScript(1, "exit 0\n");
+}
+
+#sub genSQShellExit {
+#
+#    if ($gShellStarted == 1) {
+#        printScript(1, "\n");
+#        printScript(1, "exit\n");
+#        printScript(1, "eof\n");
+#
+#        $gShellStarted = 0;
+#    }
+#}
+
+#sub genSQShellStart {
+#
+#    if ($gShellStarted == 0) {
+#        printScript(1, "\n");
+#        printScript(1, "sqshell -a <<eof\n");
+#
+#        $gShellStarted = 1;
+#    }
+#}
+
+sub printSQShellCommand {
+    printScript(1, substr($_,1));
+}
+
+sub executeShellCommand {
+    $lv_cmd=substr($_,1);
+    print "Command: $lv_cmd";
+    $lv_cmd_output=`$lv_cmd` ;
+    print $lv_cmd_output;
+
+}
+
+
+sub processNameserver {
+    my $err = 0;
+    while (<>) {
+        if (/^begin name-server/) {
+        }
+        elsif (/^end name-server/) {
+            if (sqnameserver::validateNameserver() != 0) {
+                $err = 1;
+            }
+            if ($err != 0) {
+                print "   Error: not a valid name-server configuration statement.\n";
+                print "Exiting without generating sqconfig.db due to errors.\n";
+                exit 1;
+            }
+            return;
+        }
+        else {
+            if (sqnameserver::parseStmt() != 0) {
+                $err = 1;
+            }
+        }
+    }
+}
+
+sub processNodes {
+    my $bNodeSpecified = 0;
+    # Read the node section from <> until "end node"; empty and '#' lines are skipped.
+    while (<>) {
+        next if (/^$/);
+        next if (/^#/);
+        if (/^_virtualnodes/) {
+            @words=split(' ',$_);
+            $gdNumNodes=@words[1];
+            $bVirtualNodes=1;
+            my $l_dNodeIndex = 0;
+            # Virtual configuration: the node count is the second word of the line
+            print "Generating virtual configuration database, node-name=$g_HostName, virtual nodes count=$gdNumNodes\n";
+
+            sqnodes::genVirtualConfigDb( $g_HostName, $gdNumNodes );
+            for ($l_dNodeIndex = 0; $l_dNodeIndex < $gdNumNodes; $l_dNodeIndex++) {
+                # Node i is mapped to zone i and recorded in @g_EdgeNodes
+                $gNodeIdToZoneIdIndex[$l_dNodeIndex] = $l_dNodeIndex;
+
+                push(@g_EdgeNodes, $l_dNodeIndex);
+            }
+        }
+        elsif (/^end node/) {
+
+            # Just for the time being - this should be an error
+            if (($bNodeSpecified == 0) &&
+                ($bVirtualNodes == 0)) {
+                $gdNumNodes = 1;
+            }
+            # Non-virtual configuration: validate the parsed statements, then generate sqconfig.db
+            if ($bVirtualNodes == 0)
+            {
+                if (sqnodes::validateConfig() == 0)
+                {   # Valid configuration, generate sqconfig.db
+                    $gdNumNodes = sqnodes::numNodes();
+
+                    sqnodes::genConfigDb( );
+                }
+                # NOTE(review): a non-zero validateConfig() result is not reported here
+                my $lv_numEdgeNodes = sqnodes::getNumberOfConnNodes();
+                my $lv_node_index = 0;
+                for ($i=1; $i <= $lv_numEdgeNodes; $i++) {
+                    $lv_node_index = sqnodes::getConnNode($i);
+                    push(@g_EdgeNodes, $lv_node_index);
+                }
+                # Every node id 0 .. $gdNumNodes-1 is added to @g_BackupTSENode
+                for ($i=0; $i < $gdNumNodes; $i++) {
+                    push(@g_BackupTSENode, $i);
+                }
+            }
+            # The node section is complete
+            return;
+        }
+        else {
+            if (sqnodes::parseStmt() == 0) {
+                $bNodeSpecified = 1;
+                $gdZoneId++;
+            }
+            else {
+                print "   Error: not a valid node configuration statement.\n";
+                print "Exiting without generating sqconfig.db due to errors.\n";
+                exit 1;
+            }
+        }
+    }
+}
+
+sub processPersist {
+    my $err = 0;
+    while (<>) {
+        if (/^begin persist/) {
+        }
+        elsif (/^end persist/) {
+            if (sqpersist::validatePersist() != 0) {
+                $err = 1;
+            }
+            if ($err != 0) {
+                print "   Error: not a valid persist configuration statement.\n";
+                print "Exiting without generating sqconfig.db due to errors.\n";
+                exit 1;
+            }
+            return;
+        }
+        else {
+            if (sqpersist::parseStmt() != 0) {
+                $err = 1;
+            }
+        }
+    }
+}
+
+sub printZoneList {
+    # Debug aid: dump the zone list and the current zone id.
+    # Prints nothing unless $gDebug is set.
+    return unless $gDebug;
+
+    my $count = scalar(@g_zonelist);
+
+    print "Number of nodes: ", $count, "\n";
+    print "Current Zone ID = $gdZoneId\n";
+
+    # One line per entry, labelled with its index.
+    foreach my $idx (0 .. $count - 1) {
+        print "g_zonelist[$idx]=", $g_zonelist[$idx], "\n";
+    }
+}
+
+sub processNotifyEmail {
+    # Translate the notify_email section read from <> into NOTIFY_MAIL_* lines
+    # appended to ms.env.  Stops at "end notify_email" or at end of input.
+
+    # Derive the path here (same expression as printInitialLines) instead of
+    # relying on the global $msenv having been set by an earlier call.
+    my $msenv = "$ENV{'TRAF_VAR'}/ms.env";
+    open (ETC,">>$msenv")
+        or die("unable to open $msenv");
+
+    my @l_line = ();
+    while (<>) {
+        if(/^enable/) {
+            @l_line = split(' ', $_);
+            print ETC "NOTIFY_MAIL_ENABLE=$l_line[1]\n";
+        }
+        elsif(/^severity_level/) {
+            @l_line = split(' ', $_);
+            print ETC "NOTIFY_MAIL_SEVERITY_LEVEL=$l_line[1]\n";
+        }
+        elsif(/^subject_prefix/) {
+            # The prefix may consist of several words: keep all of them,
+            # separated by single spaces.
+            @l_line = split(' ', $_);
+            print ETC "NOTIFY_MAIL_SUBJECT_PREFIX=" . join(' ', @l_line[1 .. $#l_line]) . "\n";
+        }
+        elsif(/^receiver_address/) {
+            @l_line = split(' ', $_);
+            print ETC "NOTIFY_MAIL_TO=$l_line[1]\n";
+        }
+        elsif(/^end notify_email/) {
+            last;
+        }
+    }
+    # Close on every exit path, including a missing "end notify_email" line.
+    close(ETC);
+}
+
+sub processOverflow {
+    # Read the overflow section from <>: the second field of every "ssd" line is
+    # appended to @g_ssdOverflow and of every "hdd" line to @g_hddOverflow.
+    # Returns when "end overflow" is read (or the input is exhausted).
+
+    while (<>) {
+        return if (/^end overflow/);
+
+        if (/^ssd/) {
+            @ssdLine = split(' ', $_);
+            # Lines without a (true) second field contribute nothing.
+            push(@g_ssdOverflow, $ssdLine[1]) if ($ssdLine[1]);
+        }
+        elsif (/^hdd/) {
+            @hddLine = split(' ', $_);
+            push(@g_hddOverflow, $hddLine[1]) if ($hddLine[1]);
+        }
+    }
+}
+
+# Parse a floating_ip section of the configuration.
+#  - "process" lines: the third ';'-separated field, if of the form
+#    external-ip=<addr>, sets $gFloatingExternalIp.
+#  - floating_ip_node_id / floating_ip_failover_node_id: the node id must be
+#    in [0, $gdNumNodes) and present in @g_EdgeNodes.
+# Any validation failure prints an error and exits with $BDR_ERROR.
+sub processFloatingIp {
+    while (<>) {
+        if (/^process/) {
+            @this_line = split(/;/, $_);
+            if($#this_line >= 2) {
+                @external_ip = split(/=/,$this_line[2]);
+                if (($#external_ip >= 1) && ($external_ip[0] eq "external-ip")) {
+                    $gFloatingExternalIp = $external_ip[1];
+                    $gFloatingExternalIp =~ s/\s+$//; # remove trailing spaces, including new-line characters
+                }
+            }
+        }
+        elsif(/^floating_ip_node_id/) {
+            @this_line = split(' ',$_);
+            if($#this_line > 0) {
+                $gFloatingNodeId=$this_line[1];
+                # Validate the node id
+                if (($gFloatingNodeId < 0) || ($gFloatingNodeId >= $gdNumNodes)) {
+                    print "Error: Invalid Floating IP Node Id provided. Please check your config file.\n";
+                    print "Exiting..\n";
+                    exit $BDR_ERROR;
+                }
+                $lv_bEdgeNodeFound = 0;
+                for ($lv_i = 0; $lv_i < $#g_EdgeNodes + 1 ; $lv_i++) {
+                    if ($g_EdgeNodes[$lv_i] == $gFloatingNodeId) {
+                        $lv_bEdgeNodeFound = 1;
+                        last; # stop at the first match ("break" is not a Perl loop control)
+                    }
+                }
+                if ($lv_bEdgeNodeFound == 0) {
+                    print "Error: Floating IP Node Id : $gFloatingNodeId is NOT an edge node. Please check your config file.\n";
+                    print "Exiting..\n";
+                    exit $BDR_ERROR;
+                }
+            }
+        }
+        elsif(/^floating_ip_failover_node_id/) {
+            @this_line = split(' ',$_);
+            if($#this_line > 0) {
+                $gFloatingFailoverNodeId=$this_line[1];
+                # Validate the node id
+                if (($gFloatingFailoverNodeId < 0) || ($gFloatingFailoverNodeId >= $gdNumNodes)) {
+                    print "Error: Invalid Floating IP Failover Node Id provided. Please check your config file.\n";
+                    print "Exiting..\n";
+                    exit $BDR_ERROR;
+                }
+                $lv_bEdgeNodeFound = 0;
+                for ($lv_i = 0; $lv_i < $#g_EdgeNodes + 1 ; $lv_i++) {
+                    if ($g_EdgeNodes[$lv_i] == $gFloatingFailoverNodeId) {
+                        $lv_bEdgeNodeFound = 1;
+                        last; # stop at the first match
+                    }
+                }
+                if ($lv_bEdgeNodeFound == 0) {
+                    print "Error: Floating IP  Failover Node Id : $gFloatingFailoverNodeId is NOT an edge node. Please check your config file.\n";
+                    print "Exiting..\n";
+                    exit $BDR_ERROR;
+                }
+            }
+        }
+        elsif(/^end floating_ip/) {
+            if ($gFloatingNodeId == -1) {
+                print "Error: floating_ip_node_id not provided. Please check your config file.\n";
+                print "Exiting..\n";
+                exit $BDR_ERROR;
+            }
+            if ($gFloatingExternalIp eq "") {
+                print "Error: bdr_ip_address is not provided, Please check your config file.\n";
+                print "Exiting..\n";
+                exit $BDR_ERROR;
+            }
+            return;
+        }
+    }
+}
+
+# Write the standard two-line header (bash shebang plus a generation
+# timestamp from getTime()) to the file handle passed as the first argument.
+sub printInitLinesAuxFiles {
+    my ($out) = @_;
+    print {$out} "#!/bin/bash\n";
+    print {$out} "# Trafodion config/utility file generated @ ", getTime(), "\n";
+}
+
+sub openFiles {
+    # Every generated-script file open below is disabled (commented out).
+#    open (SQS,">$coldscriptFileName")
+#        or die("unable to open $coldscriptFileName");
+
+#    open (IDTM,">$startIDTM")
+#        or die("unable to open $startIDTM");
+
+#    open (CMON,">$startCMON")
+#        or die("unable to open $startCMON");
+
+#    open (NMON,">$startNMON")
+#        or die("unable to open $startNMON");
+
+#    open (TNOTIFY,">$startTNOTIFY")
+#        or die("unable to open $startTNOTIFY");
+
+#    open (TM,">$startTM")
+#        or die("unable to open $startTM");
+
+#    open (RMS,">$startRMS")
+#        or die("unable to open $startRMS");
+
+#    open (RMSS,">$stopRMS")
+#        or die("unable to open $stopRMS");
+
+#    open (SSMP,">$startSSMP")
+#        or die("unable to open $startSSMP");
+
+#    open (SSMPS,">$stopSSMP")
+#        or die("unable to open $stopSSMP");
+
+#    open (SSCP,">$startSSCP")
+#        or die("unable to open $startSSCP");
+
+#    open (SSCPS,">$stopSSCP")
+#        or die("unable to open $stopSSCP");
+    # The only remaining action: open the configuration database.
+    sqconfigdb::openDb();
+}
+
+sub endGame {
+    # In a virtual-node configuration, write the shell environment file
+    # ($sqshell) under $TRAF_VAR and make it accessible to the owner only.
+    if ($bVirtualNodes == 1) {
+        my $shellEnvPath = "$ENV{'TRAF_VAR'}/$sqshell";
+        open (SQSH, ">$shellEnvPath")
+            or die("unable to open $sqshell");
+        printInitLinesAuxFiles (SQSH);
+        print SQSH "export SQ_VIRTUAL_NODES=$gdNumNodes\n";
+        print SQSH "export SQ_VIRTUAL_NID=0\n";
+        close(SQSH);
+
+        print "\nGenerated SQ Shell environment file: $sqshell\n";
+        # Mode 0700: read/write/execute for the owner, nothing for others.
+        chmod 0700, $shellEnvPath;
+    }
+#    print SQSH "\nshell \$1 \$2 \$3 \$4 \$5 \$6 \$7 \$8 \$9\n";
+
+
+    print "\n";
+#    print "Generated SQ startup script file: $coldscriptFileName\n";
+#    print "Generated IDTM Startup      file: $startIDTM\n";
+#    print "Generated TM Startup        file: $startTM\n";
+#    print "Generated RMS Startup       file: $startRMS\n";
+#    print "Generated RMS Stop          file: $stopRMS\n";
+#    print "Generated SSMP Startup      file: $startSSMP\n";
+#    print "Generated SSMP Stop         file: $stopSSMP\n";
+#    print "Generated SSCP Startup      file: $startSSCP\n";
+#    print "Generated SSCP Stop         file: $stopSSCP\n";
+#    print "Generated CMON Startup      file: $startCMON\n";
+#    print "Generated NMON Startup      file: $startNMON\n";
+#    print "Generated TNOTIFY Startup   file: $startTNOTIFY\n";
+
+#    close(SQS);
+#    close(DBZ);
+
+#    close(RMS);
+#    close(RMSS);
+
+#    close(SSMP);
+#    close(SSMPS);
+
+#    close(SSCP);
+#    close(SSCPS);
+
+#    close(CMON);
+#    close(NMON);
+#    close(TNOTIFY);
+
+#    chmod 0700, $coldscriptFileName;
+
+
+#    chmod 0700, $startIDTM;
+#    chmod 0700, $startTM;
+
+#    chmod 0700, $startRMS;
+#    chmod 0700, $stopRMS;
+
+#    chmod 0700, $startSSMP;
+#    chmod 0700, $stopSSMP;
+
+#    chmod 0700, $startSSCP;
+#    chmod 0700, $stopSSCP;
+
+#    chmod 0700, $startCMON;
+#    chmod 0700, $startNMON;
+#    chmod 0700, $startTNOTIFY;
+}
+
+sub doInit {
+    # Positional command-line arguments, in order: script file name, host
+    # name, FT flag and PERF flag.  Anything left in @ARGV afterwards is
+    # treated as an input file.
+    ($scriptFileName, $g_HostName, $g_FTFlag, $g_PERFFlag)
+        = splice(@ARGV, 0, 4);
+
+    # Start/stop script names formerly set here are disabled.
+
+#    $startIDTM="idtmstart";
+#    $startTM="tmstart";
+
+#    $startRMS="rmsstart";
+#    $startSSMP="ssmpstart";
+#    $startSSCP="sscpstart";
+#    $stopRMS="rmsstop";
+#    $stopSSMP="ssmpstop";
+#    $stopSSCP="sscpstop";
+
+#    $startCMON="cmonstart";
+#    $startNMON="nmonstart";
+#    $startTNOTIFY="tnotifystart";
+
+#    $coldscriptFileName=sprintf("%s.cold", $scriptFileName);
+
+    $sqshell = "monshell.env";
+
+    # Count of "processor" lines in /proc/cpuinfo.
+    # NOTE(review): the value keeps the command's trailing newline (no chomp).
+    $gdNumCpuCores = qx{cat /proc/cpuinfo | grep "processor" | wc -l};
+}
+
+#
+# Main
+#
+
+# Set globals from the command line, then run the file-opening step,
+# before any input line is read.
+doInit();
+openFiles;
+
+# Route each input line by its leading keyword or marker character.
+while (<>) {
+    if (/^begin node/) {
+        processNodes;
+        printInitialLines;
+    }
+    elsif (/^begin notify_email/) {
+        processNotifyEmail;
+        printNotifyEmailLines;
+    }
+    elsif (/^begin overflow/) {
+        processOverflow;
+        printOverflowLines;
+    }
+    elsif (/^begin floating_ip/) {
+        processFloatingIp;
+    }
+    elsif (/^begin persist/) {
+        processPersist;
+    }
+    elsif (/^begin name-server/) {
+        processNameserver;
+    }
+    elsif (/^%/) {
+        printSQShellCommand;
+    }
+    elsif (/^!/) {
+        executeShellCommand;
+    }
+    elsif (/^#/ || /^\s*$/) {
+        # Comment or blank line: ignored.
+    }
+    else {
+        # Anything else is rejected and aborts the run.
+        print "invalid line:", $_;
+        exit 1
+    }
+}
+
+#printZoneList;
+
+printScriptEndLines;
+
+endGame;
diff --git a/core/sqf/monitor/test/montestutil.cxx b/core/sqf/monitor/test/montestutil.cxx
index 43e16c9..7e0f2e1 100644
--- a/core/sqf/monitor/test/montestutil.cxx
+++ b/core/sqf/monitor/test/montestutil.cxx
@@ -74,9 +74,6 @@
         case MsgType_NodeJoining:
             str = "MsgType_NodeJoining";
             break;
-        case MsgType_NodePrepare:
-            str = "MsgType_NodePrepare";
-            break;
         case MsgType_NodeQuiesce:
             str = "MsgType_NodeQuiesce";
             break;
@@ -101,18 +98,6 @@
         case MsgType_SpareUp:
             str = "MsgType_SpareUp";
             break;
-        case MsgType_TmRestarted:
-            str = "MsgType_TmRestarted";
-            break;
-        case MsgType_TmSyncAbort:
-            str = "MsgType_TmSyncAbort";
-            break;
-        case MsgType_TmSyncCommit:
-            str = "MsgType_TmSyncCommit";
-            break;
-        case MsgType_UnsolicitedMessage:
-            str = "MsgType_UnsolicitedMessage";
-            break;
         default:
             str = "MsgType - Undefined";
             break;
@@ -121,6 +106,50 @@
 }
 
 
+///////////////////////////////////////////////////////////////////////////////
+//
+// Function: StateString
+//
+// Purpose:  Return the symbolic name of a state value so diagnostics can
+//           print names such as "State_Down" instead of raw numbers.
+//
+// Argument: state - value to translate; compared against the State_*
+//                   constants listed in the switch below.
+//
+// Returns:  Pointer to a string literal.  Values that match none of the
+//           cases yield "State - Undefined".
+//
+///////////////////////////////////////////////////////////////////////////////
+const char *StateString( int state)
+{
+    switch( state )
+    {
+        case State_Unknown:
+            return( "State_Unknown" );
+        case State_Up:
+            return( "State_Up" );
+        case State_Down:
+            return( "State_Down" );
+        case State_Stopped:
+            return( "State_Stopped" );
+        case State_Shutdown:
+            return( "State_Shutdown" );
+        case State_Unlinked:
+            return( "State_Unlinked" );
+        case State_Merging:
+            return( "State_Merging" );
+        case State_Merged:
+            return( "State_Merged" );
+        case State_Joining:
+            return( "State_Joining" );
+        case State_Initializing:
+            return( "State_Initializing" );
+        default:
+            // Not one of the states named above.
+            return( "State - Undefined" );
+    }
+}
+
 //////////////////////////////////////////////////////////////////////////////
 // Monitor test utility
 //////////////////////////////////////////////////////////////////////////////
@@ -505,10 +534,8 @@
         int             us;
         struct timeval  t;
         struct tm       tx;
-        struct tm      *txp;
         char            buf[BUFSIZ];
         gettimeofday(&t, NULL);
-        txp = localtime_r(&t.tv_sec, &tx);
         ms = (int) t.tv_usec / 1000;
         us = (int) t.tv_usec - ms * 1000;
 
@@ -568,7 +595,7 @@
         char tracefile[MAX_SEARCH_PATH];
         char *tmpDir;
 
-        tmpDir = getenv( "XMPI_TMPDIR" );
+        tmpDir = getenv( "TRAF_LOG" );
         if (tmpDir)
         {
             sprintf( tracefile, "%s/shell.trace.%d", tmpDir, getpid() );
@@ -931,11 +958,10 @@
                 {
                     if ( trace_ )
                     {
-                        printf( "[%s] - process info for %s returned data for %d processes, expected=%d\n"
+                        printf( "[%s] - process info for %s returned data for %d processes\n"
                                 , processName_
                                 , processName
-                                , msg->u.reply.u.process_info.num_processes
-                                , 1);
+                                , msg->u.reply.u.process_info.num_processes );
                     }
                 }
             }
@@ -1469,8 +1495,6 @@
 
 bool MonTestUtil::requestNodeDown( int nid )
 {
-    int count;
-    MPI_Status status;
     struct message_def *msg;
     bool result = false;
 
@@ -1493,8 +1517,6 @@
     msg->u.request.u.down.node_name[0] = '\0';
 
     gp_local_mon_io->send( msg );
-    count = sizeof (*msg);
-    status.MPI_TAG = msg->reply_tag;
 
     return result;
 }
diff --git a/core/sqf/monitor/test/montestutil.h b/core/sqf/monitor/test/montestutil.h
old mode 100755
new mode 100644
index 594b367..fca4af6
--- a/core/sqf/monitor/test/montestutil.h
+++ b/core/sqf/monitor/test/montestutil.h
@@ -1,6 +1,6 @@
 ///////////////////////////////////////////////////////////////////////////////
 //
-// @@@ START COPYRIGHT @@@
+// @@@@@@ START COPYRIGHT @@@@@@
 //
 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
@@ -19,7 +19,7 @@
 // specific language governing permissions and limitations
 // under the License.
 //
-// @@@ END COPYRIGHT @@@
+// @@@@@@ END COPYRIGHT @@@@@@
 //
 ///////////////////////////////////////////////////////////////////////////////
 
@@ -154,4 +154,7 @@
 
 const char *MessageTypeString( MSGTYPE type );
 
+const char *StateString( int state);
+
 #endif
+
diff --git a/core/sqf/monitor/test/persistentProc.cxx b/core/sqf/monitor/test/persistentProc.cxx
old mode 100755
new mode 100644
index b9edd47..904ca23
--- a/core/sqf/monitor/test/persistentProc.cxx
+++ b/core/sqf/monitor/test/persistentProc.cxx
@@ -27,6 +27,7 @@
 
 #include <stdio.h>
 #include <stdlib.h>
+#include <unistd.h>
 
 #include "clio.h"
 #include "sqevlog/evl_sqlog_writer.h"
@@ -90,41 +91,26 @@
     char procName[25];
     char *serverArgs[1] = {(char *) "-t"};
     enum { TEST_FAILED=0 };
-    char value[25];
-
-    // Set persistent process registry values for process $ABC
-    const int persistNode1 = 2;
-    const int persistNode2 = 4;
-    printf("[%s] For process $ABC setting PERSIST_ZONES=%d,%d\n",
-            MyName, persistNode1, persistNode2);
-
-    sprintf(value, "%d,%d", persistNode1, persistNode2);
-    if (!util.requestSet ( ConfigType_Process, "$ABC", "PERSIST_ZONES",
-                           value))
-    {
-        return TEST_FAILED;
-    }
-
-    int maxRetries = 1;
-    int retryResetTime = 10;
-    printf("[%s] For process $ABC setting PERSIST_RETRIES=%d,%d\n",
-            MyName, maxRetries, retryResetTime);
-
-    sprintf(value, "%d,%d", maxRetries, retryResetTime);
-    // Set count of times to restart and persistent "max time".
-    if (!util.requestSet ( ConfigType_Process, "$ABC", "PERSIST_RETRIES",
-                           value))
-    {
-        return TEST_FAILED;
-    }
+    const int persistNode1 = 1;
+    const int persistNode2 = 2;
 
     // Start the server process
-    if (!util.requestNewProcess ( persistNode1, ProcessType_Generic, false,
-                                  "$ABC", "server", "", "",
-                                  ((tracing) ? 1: 0), serverArgs,
-                                  procNid, procPid, procVerifier, procName))
+    if (!util.requestNewProcess( persistNode1
+                               , ProcessType_PERSIST
+                               , false
+                               , "$ABC"
+                               , "server"
+                               , ""
+                               , ""
+                               , ((tracing) ? 1: 0)
+                               , serverArgs
+                               , procNid
+                               , procPid
+                               , procVerifier
+                               , procName) )
     {
-        return TEST_FAILED;
+        testSuccess = false;
+        return testSuccess;
     }
 
     // Allow time for process creation
@@ -142,7 +128,8 @@
             printf ("[%s] process $ABC (%d, %d:%d) is not running on node %d "
                     "as expected.\n", MyName, statNid1, statPid1,
                     statVerifier1, persistNode1);
-            return TEST_FAILED;
+            testSuccess = false;
+            return testSuccess;
         }
         else
         {
@@ -154,7 +141,8 @@
     {
         printf ("[%s] Started persisten process $ABC but unable to get "
                 "process info for it.\n", MyName);
-        return TEST_FAILED;
+        testSuccess = false;
+        return testSuccess;
     }
 
     printf ("[%s] Killing process $ABC\n", MyName);
@@ -176,13 +164,15 @@
             printf ("[%s] process $ABC (%d, %d:%d) is not running on node %d "
                     "as expected.\n", MyName, statNid2, statPid2,
                     statVerifier2, persistNode1);
-            return TEST_FAILED;
+            testSuccess = false;
+            return testSuccess;
         }
         if ( statPid2 == statPid1 )
         {
             printf ("[%s] process $ABC apparently not restarted, old pid is "
                      "the same as the current pid (%d)\n", MyName, statPid2);
-            return TEST_FAILED;
+            testSuccess = false;
+            return testSuccess;
         }
         else
         {
@@ -194,7 +184,8 @@
     else
     {
         printf ("[%s] Unable to get process info for $ABC\n", MyName);
-        return TEST_FAILED;
+        testSuccess = false;
+        return testSuccess;
     }
 
     printf ("[%s] Killing process $ABC\n", MyName);
@@ -210,7 +201,8 @@
     {
         printf("[%s] Unexpectedly got $ABC (%d, %d:%d) process status\n",
                MyName, statNid2, statPid2, statVerifier2);
-        return TEST_FAILED;
+        testSuccess = false;
+        return testSuccess;
     }
     else
     {
@@ -219,12 +211,22 @@
     }
 
     // Start the server process
-    if (!util.requestNewProcess ( persistNode1, ProcessType_Generic, false,
-                                  "$ABC", "server", "", "",
-                                  ((tracing) ? 1: 0), serverArgs,
-                                  procNid, procPid, procVerifier, procName))
+    if (!util.requestNewProcess( persistNode1
+                               , ProcessType_PERSIST
+                               , false
+                               , "$ABC"
+                               , "server"
+                               , ""
+                               , ""
+                               , ((tracing) ? 1: 0)
+                               , serverArgs
+                               , procNid
+                               , procPid
+                               , procVerifier
+                               , procName) )
     {
-        return TEST_FAILED;
+        testSuccess = false;
+        return testSuccess;
     }
 
     // Allow time for process creation
@@ -249,7 +251,8 @@
     else
     {
         printf ("[%s] Unable to get process info for $ABC\n", MyName);
-        return TEST_FAILED;
+        testSuccess = false;
+        return testSuccess;
     }
 
     printf ("[%s] Downing node %d\n", MyName, persistNode1);
@@ -263,29 +266,41 @@
     {
         if ( util.requestNodeInfo ( -1, false, -1, -1, nodeData ) )
         {  // Got node data
-            if ( nodeData->node[2].state == 2 )
+            if ( nodeData->node[persistNode1].state == State_Down )
             {
-                printf ("[%s] Process status for node 2 is DOWN.\n",  MyName);
+                printf( "[%s] Node status for node %d is %s.\n"
+                      , MyName
+                      , persistNode1
+                      , StateString(nodeData->node[persistNode1].state) );
                 break;
             }
             else
             {
-                printf ("[%s] Process status for node 2 is %d.\n",  MyName,
-                         nodeData->node[2].state);
+                printf("[%s] Node status for node %d is %s, expecting %s\n"
+                      , MyName
+                      , persistNode1
+                      , StateString(nodeData->node[persistNode1].state)
+                      , StateString(State_Down) );
             }
         }
         else
         {   // Failed to get node data
             printf ("[%s] Unable to get node info\n", MyName);
-            return TEST_FAILED;
+            testSuccess = false;
+            return testSuccess;
         }
         sleep(2);
     }
-    if ( nodeData->node[2].state != 2 )
+    if ( nodeData->node[persistNode1].state != State_Down )
     {
-        printf ("[%s] After downing node 2, node state=%d but expected "
-                "state=2\n", MyName, nodeData->node[2].state);
-        return TEST_FAILED;
+        printf ("[%s] After downing node %d, node state=%s but expected "
+                "state=%s\n"
+                , MyName
+                , persistNode1
+                , StateString(nodeData->node[persistNode1].state)
+                , StateString(State_Down) );
+        testSuccess = false;
+        return testSuccess;
     }
     
     // Verify process was restarted on new node
@@ -293,22 +308,31 @@
     {
         if ( statNid1 != persistNode2 )
         {
-            printf ("[%s] process $ABC (%d, %d:%d) is not running on node %d "
-                    "as expected.\n", MyName, statNid1, statPid1,
-                    statVerifier1, persistNode2);
+            printf( "[%s] process $ABC (%d, %d:%d) is not running on node %d "
+                    "as expected.\n"
+                   , MyName
+                   , statNid1
+                   , statPid1
+                   , statVerifier1
+                   , persistNode2 );
             testSuccess = false;
+            return testSuccess;
         }
         else
         {
-            printf ("[%s] Persistent process $ABC (%d, %d:%d) was restarted as "
-                    "expected.\n",
-                    MyName, statNid1, statPid1, statVerifier1);
+            printf( "[%s] Persistent process $ABC (%d, %d:%d) was restarted as "
+                    "expected.\n"
+                  , MyName
+                  , statNid1
+                  , statPid1
+                  , statVerifier1);
         }
     }
     else
     {
         printf ("[%s] Unable to get process info for $ABC\n", MyName);
-        return TEST_FAILED;
+        testSuccess = false;
+        return testSuccess;
     }
 
     printf ("[%s] Killing process $ABC\n", MyName);
@@ -323,14 +347,18 @@
     // Verify process not restarted
     if (util.requestProcInfo( "$ABC", statNid2, statPid2, statVerifier2 ))
     {
-        printf("[%s] Unexpectedly got $ABC (%d, %d:%d) process status\n",
-               MyName, statNid2, statPid2, statVerifier2);
+        printf( "[%s] Persistent process $ABC (%d, %d:%d) restart was not "
+                "expected.\n"
+              , MyName
+              , statNid2
+              , statPid2
+              , statVerifier2);
         testSuccess = false;
+        return testSuccess;
     }
     else
     {
-        printf("[%s] Confirmed: process $ABC was not restarted.\n",
-               MyName);
+        printf( "[%s] Confirmed: process $ABC was not restarted.\n",  MyName);
     }
 
     return testSuccess;
diff --git a/core/sqf/monitor/test/persistentProc.sub b/core/sqf/monitor/test/persistentProc.sub
old mode 100755
new mode 100644
diff --git a/core/sqf/monitor/test/procCreate.cxx b/core/sqf/monitor/test/procCreate.cxx
index 8247bd4..90b11b4 100644
--- a/core/sqf/monitor/test/procCreate.cxx
+++ b/core/sqf/monitor/test/procCreate.cxx
@@ -31,6 +31,7 @@
 
 #include <stdio.h>
 #include <stdlib.h>
+#include <unistd.h>
 
 #include "clio.h"
 #include "sqevlog/evl_sqlog_writer.h"
@@ -220,6 +221,15 @@
         }
 
     }
+    else if ( recv_msg->type == MsgType_Change ) 
+    {
+        if (tracing) 
+            printf("[%s] Message Type Change: ConfigType=%d GroupName=%s Key=%s Value=%s\n", MyName, 
+                   recv_msg->u.request.u.change.type,
+                   recv_msg->u.request.u.change.group,
+                   recv_msg->u.request.u.change.key,
+                   recv_msg->u.request.u.change.value );
+    }
     else {
         printf("[%s] unexpected notice, type=%s\n", MyName,
                MessageTypeString( recv_msg->type));
diff --git a/core/sqf/monitor/test/regTestCtrl.cxx b/core/sqf/monitor/test/regTestCtrl.cxx
old mode 100755
new mode 100644
diff --git a/core/sqf/monitor/test/runtest b/core/sqf/monitor/test/runtest
index 5580d0a..d00f2cc 100755
--- a/core/sqf/monitor/test/runtest
+++ b/core/sqf/monitor/test/runtest
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 #
 # @@@ START COPYRIGHT @@@
 #
@@ -26,11 +26,11 @@
 # Cluster environment setup:
 #    - create directories
 #      cd $TRAF_HOME
-#      pdsh $MY_NODES mkdir $PWD/monitor
-#      pdsh $MY_NODES mkdir $PWD/monitor/test
+#      edb_pdsh -a mkdir $PWD/monitor
+#      edb_pdsh -a mkdir $PWD/monitor/test
 #    - copy test files to $TRAF_HOME/monitor/test in all nodes
 #      cd $TRAF_HOME/monitor/test
-#      pdcp $MY_NODES -p -r /home/sqft2/monitor/test/* $PWD/
+#      pdcp $(trafconf -wname) -p -r /home/sqft2/monitor/test/* $PWD/
 #    - execute runtest -cluster
 #      cd $TRAF_HOME/monitor/test
 #      runtest -cluster
@@ -51,19 +51,17 @@
     echo "         2     - Multi-Node"
     echo "         3     - Registry"
     echo "         4     - Death Notice"
-    echo "         5     - Persistent Process (FAILS-TBD)"
-    echo "         6     - DTM Process (FAILS-TBD)"
-    echo "         7     - SPX Process"
-    echo "         8     - Process Create"
-    echo "         9     - Node down before startup"
-    echo "        10     - TmSync"
+    echo "         5     - Persistent Process"
+    echo "         6     - DTM Process"
+    echo "         7     - Process Create"
+    echo "         8     - Node down before startup"
     echo
 }
 
 function Cleanup
 {
-    ckillall
-    sqipcrm
+    monpkillall
+    sqipcrm -local
 }
 
 if [ $# -lt 1 ]; then
@@ -73,6 +71,7 @@
 
 cluster=
 conf=
+conf_persist=
 virtual=
 trace=
 nogen=
@@ -84,11 +83,13 @@
 if [ "$1" = "-cluster" ]; then
     cluster=-cluster
     conf="sqconfig.monitor.cluster"
+    conf_persist="sqconfig.persist"
     shift
 fi
 if [ "$1" = "-virtual" ]; then
     virtual=-virtual
     conf="sqconfig.monitor.virtual"
+    conf_persist="sqconfig.persist"
     shift
 fi
 if [ "$1" = "-nogen" ]; then
@@ -103,6 +104,12 @@
     shift
     test=${1}
     shift
+    if ( [ $test '<' 1 ] || [ $test '>' 8 ] ); then
+        echo
+        echo "Invalid test number (-test ${test})"
+        Usage
+        exit 1
+    fi
 fi
 
 #
@@ -110,6 +117,8 @@
 #
 ARCH=`arch`
 export PATH=$PATH:$PWD/Linux-${ARCH}/64/dbg
+export TRAF_CLUSTER_ID=1
+export TRAF_INSTANCE_ID=1
 cd $TRAF_HOME/monitor/test
 echo $PWD
 
@@ -124,8 +133,8 @@
 fi
 
 if [ "$cluster" = "-cluster" ]; then
-    pdsh $MY_NODES "rm -f $MPI_TMPDIR/monitor.port.*"
-    pdsh $MY_NODES "rm $PWD/core* $PWD/*.lst $PWD/stdout_*"
+    edb_pdsh -a "rm -f $MPI_TMPDIR/monitor.port.*"
+    edb_pdsh -a "rm $PWD/core* $PWD/*.lst $PWD/stdout_*"
 else
     rm -f $MPI_TMPDIR/monitor.port.*
     rm core* *.lst stdout_*
@@ -146,47 +155,95 @@
 # Uncomment to enable LIO tracing in test programs
 #export SQ_LOCAL_IO_SHELL_TRACE=1
 
+genConf_Success=0
+genConf_Failure=255
+
+function Genconf() 
+{
 #
 # Setup up monitor configuration
 #
-if [ "$nogen" = "-nogen" ]; then
-    echo "sqgen not executed!"
-else
-    echo sqgen $TRAF_HOME/monitor/test/$conf
-    sqgen $TRAF_HOME/monitor/test/$conf
-fi
+    [ $# -eq 1 ] || return $genConf_Failure
+    # $1 is the test number; it selects the persist configuration file.
+    case $1 in
+    *[!0-9]*|"") 
+               return $genConf_Failure ;;
+    # Tests 1-5, 7 and 8 share the default persist configuration.
+    [1-5,7-8]) 
+             conf_persist=sqconfig.persist  ;;
+    # Test 6 gets its own persist configuration (sqconfig.persist.dtm).
+    [6])      
+       conf_persist=sqconfig.persist.dtm  ;;
+    # Multi-digit numbers match no pattern and leave conf_persist unchanged.
+    esac
+    # Show, then run, the generator with the selected configuration files.
+    echo "**********************************************************"
+    echo ./montestgen $TRAF_HOME/monitor/test/$conf $TRAF_HOME/monitor/test/$conf_persist
+    echo "**********************************************************"
+    #./montestgen $TRAF_HOME/monitor/test/$conf $TRAF_HOME/monitor/test/$conf_persist
+    # NOTE(review): montestgen's exit status is not checked; success is always returned.
+    ./montestgen $TRAF_HOME/monitor/test/$conf $TRAF_HOME/monitor/test/$conf_persist
+    return $genConf_Success 
+}
+
+function Shutdown_status() 
+{
+    # Report whether 'monpstat -h' still prints anything after a shutdown.
+    # The arguments (callers pass the test title) prefix the verdict line.
+    local verdict=FAILED
+    ret=$(monpstat -h | wc -l)
+    if [ $ret == "0" ]; then
+        # monpstat printed no lines.
+        verdict=PASSED
+    fi
+    echo "${*} Shutdown: ${verdict}"
+}
+
 
 #
 # Execute tests
 #
 if ( [ $test '==' -1 ] || [ $test '==' 1 ] ); then
- echo "***"
- echo "*** Executing Child Exit test"
- echo "***"
 Cleanup
-shell <<eof
+Genconf 1
+[ $? -eq 0 ] || exit 101   # top level of an executed script: 'return' is only valid in a function or sourced file
+
+TestTitle="Child Exit Test"
+ echo "***"
+ echo "*** Executing: ${TestTitle}"
+ echo "***"
+monshell <<eof
  startup
  delay 3
+ ps
  exec {name \$CTRLR, nid 0, out $TRAF_HOME/monitor/test/childExit.lst} childExitCtrl $trace
  delay 3
- !shutdown
+ persist info PP 
+ persist kill PP
+ persist info ABC 
+ persist kill ABC
  exit
 eof
-shell -nid 0 -c ps
-#shell -nid 0 -c ps;shell -nid 1 -c ps;shell -nid 2 -c ps;shell -nid 3 -c ps;shell -nid 4 -c ps;shell -nid 5 -c ps
-shell -nid 0 -c ps monitor;shell -nid 1 -c ps monitor;shell -nid 2 -c ps monitor;shell -nid 3 -c ps monitor;shell -nid 4 -c ps monitor;shell -nid 5 -c ps monitor
-shell -c ps monitor
-if ( [ $test '==' -1 ] ); then
-shell -a<<eof
+
+#monshell -nid 0 -c ps
+#monshell -nid 0 -c ps monitor
+#monshell -nid 0 -c ps;monshell -nid 1 -c ps;monshell -nid 2 -c ps;monshell -nid 3 -c ps;monshell -nid 4 -c ps;monshell -nid 5 -c ps
+#monshell -nid 0 -c ps monitor;monshell -nid 1 -c ps monitor;monshell -nid 2 -c ps monitor;monshell -nid 3 -c ps monitor;monshell -nid 4 -c ps monitor;monshell -nid 5 -c ps monitor
+
+monshell -a<<eof
+ ps
+ ps monitor
  shutdown
  exit
 eof
-fi
+
  sleep 5
+ monpstat
+ Shutdown_status ${TestTitle} >> $PWD/childExit.lst
  cstat -h
  if [ "$cluster" = "-cluster" ]; then
-     pdsh $MY_NODES grep PASSED $PWD/childExit.lst
-     pdsh $MY_NODES grep FAILED $PWD/childExit.lst
+     edb_pdsh -a grep PASSED $PWD/childExit.lst
+     edb_pdsh -a grep FAILED $PWD/childExit.lst
  else
      grep PASSED $PWD/childExit.lst
      grep FAILED $PWD/childExit.lst
@@ -194,11 +251,15 @@
 fi
 
 if ( [ $test '==' -1 ] || [ $test '==' 2 ] ); then
- echo "***"
- echo "*** Executing Multi-Node test"
- echo "***"
 Cleanup
-shell <<eof
+Genconf 2
+[ $? -eq 0 ] || exit 101   # top level of an executed script: 'return' is only valid in a function or sourced file
+
+TestTitle="Multi-Node test"
+ echo "***"
+ echo "*** Executing: ${TestTitle}"
+ echo "***"
+monshell <<eof
  startup
  delay 3
  ! General tests for monitor functionality. This includes
@@ -211,54 +272,70 @@
  ps
  exec {pri 10,name \$CLIENT,nid 0, out $TRAF_HOME/monitor/test/multiNode.lst} client $trace
  delay 3
- !shutdown
  exit
 eof
-shell -c ps 
-shell -c ps monitor
-if ( [ $test '==' -1 ] ); then
-shell -a<<eof
+
+#monshell -nid 0 -c ps
+#monshell -nid 0 -c ps monitor
+#monshell -nid 0 -c ps;monshell -nid 1 -c ps;monshell -nid 2 -c ps;monshell -nid 3 -c ps;monshell -nid 4 -c ps;monshell -nid 5 -c ps
+#monshell -nid 0 -c ps monitor;monshell -nid 1 -c ps monitor;monshell -nid 2 -c ps monitor;monshell -nid 3 -c ps monitor;monshell -nid 4 -c ps monitor;monshell -nid 5 -c ps monitor
+
+monshell -a<<eof
+ ps
+ ps monitor
  shutdown
  exit
 eof
-fi
+
  sleep 5
+ monpstat
+ Shutdown_status ${TestTitle} >> $PWD/multiNode.lst
  cstat -h
  if [ "$cluster" = "-cluster" ]; then
-     pdsh $MY_NODES grep PASSED $PWD/multiNode.lst
-     pdsh $MY_NODES grep FAILED $PWD/multiNode.lst
+     edb_pdsh -a grep PASSED $PWD/multiNode.lst
+     edb_pdsh -a grep FAILED $PWD/multiNode.lst
  else
      grep PASSED $PWD/multiNode.lst
      grep FAILED $PWD/multiNode.lst
  fi
 fi
 
- if ( [ $test '==' -1 ] || [ $test '==' 3 ] ); then
- echo "***"
- echo "*** Executing Registry test"
- echo "***"
+if ( [ $test '==' -1 ] || [ $test '==' 3 ] ); then
 Cleanup
-shell <<eof
+Genconf 3 
+[ $? -eq 0 ] || exit 101   # top level of an executed script: 'return' is only valid in a function or sourced file
+
+TestTitle="Registry Test"
+ echo "***"
+ echo "*** Executing: ${TestTitle}"
+ echo "***"
+monshell <<eof
  startup
  delay 3
  exec {name \$CTRLR, nid 0, out $TRAF_HOME/monitor/test/regTest.lst} regTestCtrl $trace
  delay 3
- !shutdown
  exit
 eof
-shell -c ps 
-shell -c ps monitor
-if ( [ $test '==' -1 ] ); then
-shell -a<<eof
+
+#monshell -nid 0 -c ps
+#monshell -nid 0 -c ps monitor
+#monshell -nid 0 -c ps;monshell -nid 1 -c ps;monshell -nid 2 -c ps;monshell -nid 3 -c ps;monshell -nid 4 -c ps;monshell -nid 5 -c ps
+#monshell -nid 0 -c ps monitor;monshell -nid 1 -c ps monitor;monshell -nid 2 -c ps monitor;monshell -nid 3 -c ps monitor;monshell -nid 4 -c ps monitor;monshell -nid 5 -c ps monitor
+
+monshell -a<<eof
+ ps
+ ps monitor
  shutdown
  exit
 eof
-fi
+
  sleep 5
+ monpstat
+ Shutdown_status ${TestTitle} >> $PWD/regTest.lst
  cstat -h
  if [ "$cluster" = "-cluster" ]; then
-     pdsh $MY_NODES grep PASSED $PWD/regTest.lst
-     pdsh $MY_NODES grep FAILED $PWD/regTest.lst
+     edb_pdsh -a grep PASSED $PWD/regTest.lst
+     edb_pdsh -a grep FAILED $PWD/regTest.lst
  else
      grep PASSED $PWD/regTest.lst
      grep FAILED $PWD/regTest.lst
@@ -266,11 +343,15 @@
 fi
 
 if ( [ $test '==' -1 ] || [ $test '==' 4 ] ); then
- echo "***"
- echo "*** Executing Death Notice test"
- echo "***"
 Cleanup
-shell <<eof
+Genconf 4
+[ $? -eq 0 ] || return 101
+
+TestTitle="Death Notice Test"
+ echo "***"
+ echo "*** Executing: ${TestTitle}"
+ echo "***"
+monshell <<eof
  startup
  delay 3
  ! Test to register and cancel process death
@@ -286,63 +367,82 @@
  !
  exec {name \$DEATH, nid 0, out $TRAF_HOME/monitor/test/deathNotice.lst} deathNotice $trace
  delay 3
- !shutdown
  exit
 eof
-shell -c ps 
-shell -c ps monitor
-if ( [ $test '==' -1 ] ); then
-shell -a<<eof
+
+#monshell -nid 0 -c ps
+#monshell -nid 0 -c ps monitor
+#monshell -nid 0 -c ps;monshell -nid 1 -c ps;monshell -nid 2 -c ps;monshell -nid 3 -c ps;monshell -nid 4 -c ps;monshell -nid 5 -c ps
+#monshell -nid 0 -c ps monitor;monshell -nid 1 -c ps monitor;monshell -nid 2 -c ps monitor;monshell -nid 3 -c ps monitor;monshell -nid 4 -c ps monitor;monshell -nid 5 -c ps monitor
+
+monshell -a<<eof
+ ps
+ ps monitor
  shutdown
  exit
 eof
-fi
+
  sleep 5
+ monpstat
+ Shutdown_status ${TestTitle} >> $PWD/deathNotice.lst
  cstat -h
  if [ "$cluster" = "-cluster" ]; then
-     pdsh $MY_NODES grep PASSED $PWD/deathNotice.lst
-     pdsh $MY_NODES grep FAILED $PWD/deathNotice.lst
+     edb_pdsh -a grep PASSED $PWD/deathNotice.lst
+     edb_pdsh -a grep FAILED $PWD/deathNotice.lst
  else
      grep PASSED $PWD/deathNotice.lst
      grep FAILED $PWD/deathNotice.lst
  fi
 fi
 
-#if ( [ $test '==' -1 ] || [ $test '==' 5 ] ); then
-if ( [ $test '==' 5 ] ); then
+if ( [ $test '==' -1 ] || [ $test '==' 5 ] ); then
  if [ "$cluster" = "-cluster" ]; then
-  echo "***"
-  echo "*** Executing Persistent Process cluster test"
-  echo "***"
 Cleanup
-shell <<eof
+Genconf 5
+[ $? -eq 0 ] || return 101
+
+TestTitle="Persistent Process CLUSTER Test"
+ echo "***"
+ echo "*** Executing: ${TestTitle}"
+ echo "***"
+monshell <<eof
  startup
  delay 3
  down 2 !
  delay 15
  exec {name \$PPROC, nid 0, out $TRAF_HOME/monitor/test/persistentProc.lst} persistentProc $trace
  delay 3
- !shutdown
  exit
 eof
-shell -c ps 
-shell -c ps monitor
-if ( [ $test '==' -1 ] ); then
-shell -a<<eof
+
+#monshell -nid 0 -c ps
+#monshell -nid 0 -c ps monitor
+#monshell -nid 0 -c ps;monshell -nid 1 -c ps;monshell -nid 2 -c ps;monshell -nid 3 -c ps;monshell -nid 4 -c ps;monshell -nid 5 -c ps
+#monshell -nid 0 -c ps monitor;monshell -nid 1 -c ps monitor;monshell -nid 2 -c ps monitor;monshell -nid 3 -c ps monitor;monshell -nid 4 -c ps monitor;monshell -nid 5 -c ps monitor
+
+monshell -a<<eof
+ ps
+ ps monitor
  shutdown
  exit
 eof
-fi
+
   sleep 5
+  monpstat
+  Shutdown_status ${TestTitle} >> $PWD/persistentProc.lst
   cstat -h
-  pdsh $MY_NODES grep PASSED $PWD/persistentProc.lst
-  pdsh $MY_NODES grep FAILED $PWD/persistentProc.lst
+  edb_pdsh -a grep PASSED $PWD/persistentProc.lst
+  edb_pdsh -a grep FAILED $PWD/persistentProc.lst
+
  else
-  echo "***"
-  echo "*** Executing Persistent Process test"
-  echo "***"
+
 Cleanup
-shell <<eof
+Genconf 5
+TestTitle="Persistent Process Test"
+ echo "***"
+ echo "*** Executing: ${TestTitle}"
+ echo "***"
+monshell <<eof
  startup
  delay 3
  exec {name \$PPROC, nid 0, out $TRAF_HOME/monitor/test/persistentProc.lst} persistentProc $trace
@@ -350,50 +450,66 @@
  !shutdown
  exit
 eof
-shell -c ps 
-shell -c ps monitor
-if ( [ $test '==' -1 ] ); then
-shell -a<<eof
+
+#monshell -nid 0 -c ps
+#monshell -nid 0 -c ps monitor
+#monshell -nid 0 -c ps;monshell -nid 1 -c ps;monshell -nid 2 -c ps;monshell -nid 3 -c ps;monshell -nid 4 -c ps;monshell -nid 5 -c ps
+#monshell -nid 0 -c ps monitor;monshell -nid 1 -c ps monitor;monshell -nid 2 -c ps monitor;monshell -nid 3 -c ps monitor;monshell -nid 4 -c ps monitor;monshell -nid 5 -c ps monitor
+
+monshell -a<<eof
+ ps
+ ps monitor
  shutdown
  exit
 eof
-fi
+
   sleep 5
-  cstat -h
+  monpstat
+  Shutdown_status ${TestTitle} >> $PWD/persistentProc.lst
   grep PASSED $PWD/persistentProc.lst
   grep FAILED $PWD/persistentProc.lst
  fi
 fi
 
-#if ( [[ $dtm_persistent_process == "1" ]] &&
-#      ([ $test '==' -1 ] || [ $test '==' 6 ]) ); then
 if ( [[ $dtm_persistent_process == "1" ]] &&
-      ([ $test '==' 6 ]) ); then
- echo "***"
- echo "*** Executing DTM Process test"
- echo "***"
+      ([ $test '==' -1 ] || [ $test '==' 6 ]) ); then
 Cleanup
-shell <<eof
+Genconf 6
+[ $? -eq 0 ] || return 101
+
+TestTitle="DTM Process Test"
+ echo "***"
+ echo "*** Executing: ${TestTitle}"
+ echo "***"
+monshell <<eof
  startup
  delay 3
+ persist info dtm
  exec {name \$DTMCTRL, nid 0, out $TRAF_HOME/monitor/test/dtmTest.lst} dtmCtrl $trace
  delay 3
- !shutdown
+ persist info dtm
  exit
 eof
-shell -c ps 
-shell -c ps monitor
-if ( [ $test '==' -1 ] ); then
-shell -a<<eof
+
+#monshell -nid 0 -c ps
+#monshell -nid 0 -c ps monitor
+#monshell -nid 0 -c ps;monshell -nid 1 -c ps;monshell -nid 2 -c ps;monshell -nid 3 -c ps;monshell -nid 4 -c ps;monshell -nid 5 -c ps
+#monshell -nid 0 -c ps monitor;monshell -nid 1 -c ps monitor;monshell -nid 2 -c ps monitor;monshell -nid 3 -c ps monitor;monshell -nid 4 -c ps monitor;monshell -nid 5 -c ps monitor
+
+monshell -a<<eof
+ ps
+ ps monitor
  shutdown
  exit
 eof
-fi
+
  sleep 5
+ monpstat
+ Shutdown_status ${TestTitle} >> $PWD/dtmTest.lst
  cstat -h
  if [ "$cluster" = "-cluster" ]; then
-     pdsh $MY_NODES grep PASSED $PWD/dtmTest.lst
-     pdsh $MY_NODES grep FAILED $PWD/dtmTest.lst
+     edb_pdsh -a grep PASSED $PWD/dtmTest.lst
+     edb_pdsh -a grep FAILED $PWD/dtmTest.lst
  else
      grep PASSED $PWD/dtmTest.lst
      grep FAILED $PWD/dtmTest.lst
@@ -404,77 +520,57 @@
    fi
 fi
 
-#if ( [ $test '==' -1 ] || [ $test '==' 7 ] ); then
-if ( [ $test '==' 7 ] ); then
- echo "***"
- echo "*** Executing SPX Process test"
- echo "***"
+if ( [ $test '==' -1 ] || [ $test '==' 7 ] ); then
 Cleanup
-shell <<eof
- startup
- delay 3
- exec {name \$SPXCTRL, nid 0, out $TRAF_HOME/monitor/test/spxTest.lst} spxCtrl $trace
- delay 3
- !shutdown
- exit
-eof
-shell -c ps 
-shell -c ps monitor
-if ( [ $test '==' -1 ] ); then
-shell -a<<eof
- shutdown
- exit
-eof
-fi
- sleep 5
- cstat -h
- if [ "$cluster" = "-cluster" ]; then
-     pdsh $MY_NODES grep PASSED $PWD/spxTest.lst
-     pdsh $MY_NODES grep FAILED $PWD/spxTest.lst
- else
-     grep PASSED $PWD/spxTest.lst
-     grep FAILED $PWD/spxTest.lst
- fi
-fi
+Genconf 7
+[ $? -eq 0 ] || return 101
 
-if ( [ $test '==' -1 ] || [ $test '==' 8 ] ); then
+TestTitle="Process Create Test"
  echo "***"
- echo "*** Executing Process Create test"
+ echo "*** Executing: ${TestTitle}"
  echo "***"
-Cleanup
-shell <<eof
+monshell <<eof
  startup
  delay 3
  exec {name \$PCRE8, nid 0, out $TRAF_HOME/monitor/test/procCreate.lst} procCreate $trace -x
  delay 3
- !shutdown
  exit
 eof
-shell -c ps 
-shell -c ps monitor
-if ( [ $test '==' -1 ] ); then
-shell -a<<eof
+
+#monshell -nid 0 -c ps
+#monshell -nid 0 -c ps monitor
+#monshell -nid 0 -c ps;monshell -nid 1 -c ps;monshell -nid 2 -c ps;monshell -nid 3 -c ps;monshell -nid 4 -c ps;monshell -nid 5 -c ps
+#monshell -nid 0 -c ps monitor;monshell -nid 1 -c ps monitor;monshell -nid 2 -c ps monitor;monshell -nid 3 -c ps monitor;monshell -nid 4 -c ps monitor;monshell -nid 5 -c ps monitor
+
+monshell -a<<eof
+ ps
+ ps monitor
  shutdown
  exit
 eof
-fi
+
  sleep 5
- cstat -h
+ monpstat
+ Shutdown_status ${TestTitle} >> $PWD/procCreate.lst
  if [ "$cluster" = "-cluster" ]; then
-     pdsh $MY_NODES grep PASSED $PWD/procCreate.lst
-     pdsh $MY_NODES grep FAILED $PWD/procCreate.lst
+     edb_pdsh -a grep PASSED $PWD/procCreate.lst
+     edb_pdsh -a grep FAILED $PWD/procCreate.lst
  else
      grep PASSED $PWD/procCreate.lst
      grep FAILED $PWD/procCreate.lst
  fi
 fi
 
-if ( [ $test '==' -1 ] || [ $test '==' 9 ] ); then
- echo "***"
- echo "*** Executing Process Creates Node down before startup test"
- echo "***"
+if ( [ $test '==' -1 ] || [ $test '==' 8 ] ); then
 Cleanup
-shell <<eof 
+Genconf 8
+[ $? -eq 0 ] || return 101
+
+TestTitle="Node Down Before Startup Test"
+ echo "***"
+ echo "*** Executing: ${TestTitle}"
+ echo "***"
+monshell<<eof 
  startup
  delay 3
  exec {nowait, name \$PCRE8, nid 0, out $TRAF_HOME/monitor/test/nodeDown.lst} procCreate $trace -y
@@ -484,173 +580,53 @@
  !shutdown
  exit
 eof
-shell -c ps 
-shell -c ps monitor
-if ( [ $test '==' -1 ] ); then
-shell -a<<eof
+
+#monshell -nid 0 -c ps
+#monshell -nid 0 -c ps monitor
+#monshell -nid 0 -c ps;monshell -nid 1 -c ps;monshell -nid 2 -c ps;monshell -nid 3 -c ps;monshell -nid 4 -c ps;monshell -nid 5 -c ps
+#monshell -nid 0 -c ps monitor;monshell -nid 1 -c ps monitor;monshell -nid 2 -c ps monitor;monshell -nid 3 -c ps monitor;monshell -nid 4 -c ps monitor;monshell -nid 5 -c ps monitor
+
+monshell -a<<eof
+ ps
+ ps monitor
  shutdown
  exit
 eof
-fi
+
  sleep 10
+ monpstat
+ Shutdown_status ${TestTitle} >> $PWD/nodeDown.lst
  cstat -h
  if [ "$cluster" = "-cluster" ]; then
-     pdsh $MY_NODES grep PASSED $PWD/nodeDown.lst
-     pdsh $MY_NODES grep FAILED $PWD/nodeDown.lst
+     edb_pdsh -a grep PASSED $PWD/nodeDown.lst
+     edb_pdsh -a grep FAILED $PWD/nodeDown.lst
  else
      grep PASSED $PWD/nodeDown.lst
      grep FAILED $PWD/nodeDown.lst
  fi
 fi
 
-
-#if ( [ $test '==' -1 ] || [ $test '==' 10 ] ); then
-if ( [ $test '==' 10 ] ); then
- if [ "$cluster" = "-cluster" ]; then
-  echo "***"
-  echo "*** Executing TmSync cluster test (subtest 7 disabled until it's fixed)"
-  echo "***"
-Cleanup
-shell <<eof
- startup
- delay 3
- exec {nowait, nid 0, name \$CTRLR, out $TRAF_HOME/monitor/test/tmSync.lst} tmSyncCtrl -n 1,3,4,5,6 $trace
- wait $CTRLR
- delay 3
- !shutdown
- exit
-eof
-shell -c ps 
-shell -c ps monitor
-if ( [ $test '==' -1 ] ); then
-shell -a<<eof
- shutdown
- exit
-eof
-fi
-  sleep 5
-  cstat -h
-  pdsh $MY_NODES grep PASSED $PWD/tmSync.lst
-  pdsh $MY_NODES grep FAILED $PWD/tmSync.lst
-  echo "***"
-  echo "*** Executing TmSync cluster test 10"
-  echo "***"
-Cleanup
-shell <<eof
- startup
- delay 3
- exec {nowait, nid 0, name \$CTRLR, out $TRAF_HOME/monitor/test/tmSync8.lst} tmSyncCtrl -n 8 $trace
- wait $CTRLR
- delay 3
- !shutdown
- exit
-eof
-shell -c ps 
-shell -c ps monitor
-if ( [ $test '==' -1 ] ); then
-shell -a<<eof
- shutdown
- exit
-eof
-fi
-  sleep 5
-  cstat -h
-  pdsh $MY_NODES grep PASSED $PWD/tmSync9.lst
-  pdsh $MY_NODES grep FAILED $PWD/tmSync9.lst
-  echo "***"
-  echo "*** Executing TmSync cluster test 10"
-  echo "***"
-Cleanup
-shell <<eof
- startup
- delay 3
- down 2 !
- delay 15
- exec {nowait, nid 0, name \$CTRLR, out $TRAF_HOME/monitor/test/tmSync10.lst} tmSyncCtrl -n 10 $trace
- wait $CTRLR
- delay 3
- !shutdown
- exit
-eof
-shell -c ps 
-shell -c ps monitor
-if ( [ $test '==' -1 ] ); then
-shell -a<<eof
- shutdown
- exit
-eof
-fi
-  sleep 5
-  cstat -h
-  pdsh $MY_NODES grep PASSED $PWD/tmSync10.lst
-  pdsh $MY_NODES grep FAILED $PWD/tmSync10.lst
- else
-  echo "***"
-  echo "*** Executing TmSync test"
-  echo "***"
-Cleanup
-shell <<eof
- startup
- delay 3
- exec {nowait, nid 0, name \$CTRLR, out $TRAF_HOME/monitor/test/tmSync.lst} tmSyncCtrl -n 3,4,5,6,7 $trace
- wait $CTRLR
- delay 3
- !shutdown
- exit
-eof
-shell -c ps 
-shell -c ps monitor
-if ( [ $test '==' -1 ] ); then
-shell -a<<eof
- shutdown
- exit
-eof
-fi
-  # Reset SQ virtual cluster parameters
-  unset SQ_VIRTUAL_NODES
-  unset SQ_VIRTUAL_NID
- fi
- 
- sleep 5
- cstat -h
- if [ "$cluster" = "-cluster" ]; then
-     pdsh $MY_NODES grep PASSED $PWD/tmSync.lst
-     pdsh $MY_NODES grep FAILED $PWD/tmSync.lst
- else
-     grep PASSED $PWD/tmSync.lst
-     grep FAILED $PWD/tmSync.lst
- fi
-fi
-
 if ( [ $test '==' -1 ] ); then
  echo "***"
  echo "*** Monitor Test Results"
  echo "***"
  if [ "$cluster" = "-cluster" ]; then
-     pdsh $MY_NODES grep PASSED $PWD/childExit.lst
-     pdsh $MY_NODES grep FAILED $PWD/childExit.lst
-     pdsh $MY_NODES grep PASSED $PWD/multiNode.lst
-     pdsh $MY_NODES grep FAILED $PWD/multiNode.lst
-     pdsh $MY_NODES grep PASSED $PWD/regTest.lst
-     pdsh $MY_NODES grep FAILED $PWD/regTest.lst
-     pdsh $MY_NODES grep PASSED $PWD/deathNotice.lst
-     pdsh $MY_NODES grep FAILED $PWD/deathNotice.lst
-#     pdsh $MY_NODES grep PASSED $PWD/persistentProc.lst
-#     pdsh $MY_NODES grep FAILED $PWD/persistentProc.lst
-#     pdsh $MY_NODES grep PASSED $PWD/dtmTest.lst
-#     pdsh $MY_NODES grep FAILED $PWD/dtmTest.lst
-#     pdsh $MY_NODES grep PASSED $PWD/spxTest.lst
-#     pdsh $MY_NODES grep FAILED $PWD/spxTest.lst
-     pdsh $MY_NODES grep PASSED $PWD/procCreate.lst
-     pdsh $MY_NODES grep FAILED $PWD/procCreate.lst
-     pdsh $MY_NODES grep PASSED $PWD/nodeDown.lst
-     pdsh $MY_NODES grep FAILED $PWD/nodeDown.lst
-#     pdsh $MY_NODES grep PASSED $PWD/tmSync.lst
-#     pdsh $MY_NODES grep FAILED $PWD/tmSync.lst
-#     pdsh $MY_NODES grep PASSED $PWD/tmSync9.lst
-#     pdsh $MY_NODES grep FAILED $PWD/tmSync9.lst
-#     pdsh $MY_NODES grep PASSED $PWD/tmSync10.lst
-#     pdsh $MY_NODES grep FAILED $PWD/tmSync10.lst
+     edb_pdsh -a grep PASSED $PWD/childExit.lst
+     edb_pdsh -a grep FAILED $PWD/childExit.lst
+     edb_pdsh -a grep PASSED $PWD/multiNode.lst
+     edb_pdsh -a grep FAILED $PWD/multiNode.lst
+     edb_pdsh -a grep PASSED $PWD/regTest.lst
+     edb_pdsh -a grep FAILED $PWD/regTest.lst
+     edb_pdsh -a grep PASSED $PWD/deathNotice.lst
+     edb_pdsh -a grep FAILED $PWD/deathNotice.lst
+     edb_pdsh -a grep PASSED $PWD/persistentProc.lst
+     edb_pdsh -a grep FAILED $PWD/persistentProc.lst
+     edb_pdsh -a grep PASSED $PWD/dtmTest.lst
+     edb_pdsh -a grep FAILED $PWD/dtmTest.lst
+     edb_pdsh -a grep PASSED $PWD/procCreate.lst
+     edb_pdsh -a grep FAILED $PWD/procCreate.lst
+     edb_pdsh -a grep PASSED $PWD/nodeDown.lst
+     edb_pdsh -a grep FAILED $PWD/nodeDown.lst
  else
      grep PASSED $PWD/childExit.lst
      grep FAILED $PWD/childExit.lst
@@ -660,17 +636,13 @@
      grep FAILED $PWD/regTest.lst
      grep PASSED $PWD/deathNotice.lst
      grep FAILED $PWD/deathNotice.lst
-#     grep PASSED $PWD/persistentProc.lst
-#     grep FAILED $PWD/persistentProc.lst
-#     grep PASSED $PWD/dtmTest.lst
-#     grep FAILED $PWD/dtmTest.lst
-#     grep PASSED $PWD/spxTest.lst
-#     grep FAILED $PWD/spxTest.lst
+     grep PASSED $PWD/persistentProc.lst
+     grep FAILED $PWD/persistentProc.lst
+     grep PASSED $PWD/dtmTest.lst
+     grep FAILED $PWD/dtmTest.lst
      grep PASSED $PWD/procCreate.lst
      grep FAILED $PWD/procCreate.lst
      grep PASSED $PWD/nodeDown.lst
      grep FAILED $PWD/nodeDown.lst
-#     grep PASSED $PWD/tmSync.lst
-#     grep FAILED $PWD/tmSync.lst
  fi
 fi
diff --git a/core/sqf/monitor/test/server.cxx b/core/sqf/monitor/test/server.cxx
old mode 100755
new mode 100644
index 014e9a5..2b13373
--- a/core/sqf/monitor/test/server.cxx
+++ b/core/sqf/monitor/test/server.cxx
@@ -25,6 +25,7 @@
 
 #include <stdio.h>
 #include <stdlib.h>
+#include <unistd.h>
 
 #include "clio.h"
 #include "sqevlog/evl_sqlog_writer.h"
diff --git a/core/sqf/monitor/test/sqconfig.persist b/core/sqf/monitor/test/sqconfig.persist
new file mode 100644
index 0000000..f1a8024
--- /dev/null
+++ b/core/sqf/monitor/test/sqconfig.persist
@@ -0,0 +1,88 @@
+# @@@ START COPYRIGHT @@@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# @@@ END COPYRIGHT @@@
+
+begin persist
+
+#
+# The following PROCESS_TYPEs are persistent and have special process management:
+#
+#    DTM
+#    TMID
+#    SSMP
+#    PSD
+#    WDG
+#    TNS
+#
+# Generic persistent processes are of PROCESS_TYPE:
+#
+#    PERSIST
+#
+
+PERSIST_PROCESS_KEYS = ABC,DTM,PP,PSD,WDG
+#PERSIST_PROCESS_KEYS = DTM,PSD,WDG
+
+ABC_PROCESS_NAME      = $ABC
+ABC_PROCESS_TYPE      = PERSIST 
+ABC_PROGRAM_NAME      = dtmProc
+ABC_PROGRAM_ARGS      =
+ABC_REQUIRES_DTM      = N
+ABC_STDOUT            = stdout_ABC
+ABC_PERSIST_RETRIES   = 1,10
+ABC_PERSIST_ZONES     = %zid+
+
+DTM_PROCESS_NAME     = $TM%nid+
+DTM_PROCESS_TYPE     = DTM
+DTM_PROGRAM_NAME     = dtm
+DTM_PROGRAM_ARGS     = -t
+DTM_REQUIRES_DTM     = N
+DTM_STDOUT           = stdout_DTM%nid
+DTM_PERSIST_RETRIES  = 2,30
+DTM_PERSIST_ZONES    = %zid
+
+PP_PROCESS_NAME      = $PP%nid+
+PP_PROCESS_TYPE      = PERSIST
+PP_PROGRAM_NAME      = dtmProc
+PP_PROGRAM_ARGS      =
+PP_REQUIRES_DTM      = N
+PP_STDOUT            = stdout_PP%nid
+PP_PERSIST_RETRIES   = 2,30
+PP_PERSIST_ZONES     = %zid
+
+PSD_PROCESS_NAME     = $PSD%nid+
+PSD_PROCESS_TYPE     = PSD
+PSD_PROGRAM_NAME     = pstartd
+PSD_PROGRAM_ARGS     =
+PSD_REQUIRES_DTM     = N
+PSD_STDOUT           = stdout_PSD%nid
+PSD_PERSIST_RETRIES  = 10,60
+PSD_PERSIST_ZONES    = %zid
+            
+WDG_PROCESS_NAME     = $WDG%nid+
+WDG_PROCESS_TYPE     = WDG
+WDG_PROGRAM_NAME     = sqwatchdog
+WDG_PROGRAM_ARGS     =
+WDG_REQUIRES_DTM     = N
+WDG_STDOUT           = stdout_WDG%nid
+WDG_PERSIST_RETRIES  = 10,60
+WDG_PERSIST_ZONES    = %zid
+
+end persist
+
diff --git a/core/sqf/monitor/test/sqconfig.persist.dtm b/core/sqf/monitor/test/sqconfig.persist.dtm
new file mode 100644
index 0000000..8bbe50d
--- /dev/null
+++ b/core/sqf/monitor/test/sqconfig.persist.dtm
@@ -0,0 +1,88 @@
+# @@@ START COPYRIGHT @@@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# @@@ END COPYRIGHT @@@
+
+begin persist
+
+#
+# The following PROCESS_TYPEs are persistent and have special process management:
+#
+#    DTM
+#    TMID
+#    SSMP
+#    PSD
+#    WDG
+#    TNS
+#
+# Generic persistent processes are of PROCESS_TYPE:
+#
+#    PERSIST
+#
+
+PERSIST_PROCESS_KEYS = ABC,DTM,PP,PSD,WDG
+#PERSIST_PROCESS_KEYS = DTM,PSD,WDG
+
+ABC_PROCESS_NAME      = $ABC
+ABC_PROCESS_TYPE      = PERSIST 
+ABC_PROGRAM_NAME      = dtmProc
+ABC_PROGRAM_ARGS      =
+ABC_REQUIRES_DTM      = N
+ABC_STDOUT            = stdout_ABC
+ABC_PERSIST_RETRIES   = 1,10
+ABC_PERSIST_ZONES     = %zid+
+
+DTM_PROCESS_NAME     = $TM%nid+
+DTM_PROCESS_TYPE     = DTM
+DTM_PROGRAM_NAME     = dtmProc
+DTM_PROGRAM_ARGS     = -t
+DTM_REQUIRES_DTM     = N
+DTM_STDOUT           = stdout_DTM%nid
+DTM_PERSIST_RETRIES  = 2,30
+DTM_PERSIST_ZONES    = %zid
+
+PP_PROCESS_NAME      = $PP%nid+
+PP_PROCESS_TYPE      = PERSIST
+PP_PROGRAM_NAME      = dtmProc
+PP_PROGRAM_ARGS      =
+PP_REQUIRES_DTM      = N
+PP_STDOUT            = stdout_PP%nid
+PP_PERSIST_RETRIES   = 2,30
+PP_PERSIST_ZONES     = %zid
+
+PSD_PROCESS_NAME     = $PSD%nid+
+PSD_PROCESS_TYPE     = PSD
+PSD_PROGRAM_NAME     = pstartd
+PSD_PROGRAM_ARGS     =
+PSD_REQUIRES_DTM     = N
+PSD_STDOUT           = stdout_PSD%nid
+PSD_PERSIST_RETRIES  = 10,60
+PSD_PERSIST_ZONES    = %zid
+            
+WDG_PROCESS_NAME     = $WDG%nid+
+WDG_PROCESS_TYPE     = WDG
+WDG_PROGRAM_NAME     = sqwatchdog
+WDG_PROGRAM_ARGS     =
+WDG_REQUIRES_DTM     = N
+WDG_STDOUT           = stdout_WDG%nid
+WDG_PERSIST_RETRIES  = 10,60
+WDG_PERSIST_ZONES    = %zid
+
+end persist
+
diff --git a/core/sqf/monitor/test/xmpi.cxx b/core/sqf/monitor/test/xmpi.cxx
old mode 100755
new mode 100644
index ebf4be2..a5aa2e4
--- a/core/sqf/monitor/test/xmpi.cxx
+++ b/core/sqf/monitor/test/xmpi.cxx
@@ -52,7 +52,7 @@
 
 static void xmpi_chk_valid_comm(MPI_Comm pv_comm)
 {
-    assert((pv_comm & XMPI_COMM_MASK) == XMPI_COMM_MASK);
+    //assert((pv_comm & XMPI_COMM_MASK) == XMPI_COMM_MASK);
     assert((pv_comm & ~XMPI_COMM_MASK) > 0);
 }
 
@@ -616,7 +616,7 @@
               MPI_Status   *status)
 {
     const char *WHERE = "XMPI_Recv";
-    int recvCount;
+    int recvCount = 0;
 
     if (gv_xmpi_trace)
         printf("%s: ENTER buf=%p, count=%d, comm=0x%x\n", WHERE, buf, count, comm);
diff --git a/core/sqf/monitor/test/xmpi.h b/core/sqf/monitor/test/xmpi.h
old mode 100755
new mode 100644
diff --git a/core/sqf/sqenvcom.sh b/core/sqf/sqenvcom.sh
index 0abb060..feda6fb 100644
--- a/core/sqf/sqenvcom.sh
+++ b/core/sqf/sqenvcom.sh
@@ -37,7 +37,7 @@
 export TRAFODION_VER="${TRAFODION_VER_MAJOR}.${TRAFODION_VER_MINOR}.${TRAFODION_VER_UPDATE}"
 
 # Product copyright header
-export PRODUCT_COPYRIGHT_HEADER="2015-2018 Apache Software Foundation"
+export PRODUCT_COPYRIGHT_HEADER="2015-2019 Apache Software Foundation"
 ##############################################################
 # Trafodion authentication:
 #    Set TRAFODION_ENABLE_AUTHENTICATION to YES to enable
@@ -74,8 +74,7 @@
 if [[ -z "$SQ_VERBOSE" ]]; then
   SQ_VERBOSE=0
 fi
-# temp variable for 64 bit cluster testing
-# export SQ_WDT_KEEPALIVETIMERVALUE=900
+
 #envvar to limit the number of memory arenas
 export MALLOC_ARENA_MAX=1
 
@@ -670,98 +669,6 @@
 # Control SQ default startup behavior (c=cold, w=warm, if removed sqstart will autocheck)
 export SQ_STARTUP=r
 
-#
-# NOTE: in a Python installation when SQ_MON_RUN_MODE below
-#       is AGENT the SQ_MON_CREATOR must be MPIRUN
-#
-#   MPIRUN - monitor process is created by mpirun
-#            (meaning that mpirun is the parent process of the monitor process)
-#   AGENT  - monitor process runs in agent mode versus MPI collective
-#
-# Uncomment the next environment variable
-#export SQ_MON_CREATOR=MPIRUN
-if [[ "$SQ_MON_CREATOR" == "MPIRUN" ]]; then
-  export SQ_MON_RUN_MODE=${SQ_MON_RUN_MODE:-AGENT}
-  export MONITOR_COMM_PORT=${MONITOR_COMM_PORT:-23390}
-  export MONITOR_SYNC_PORT=${MONITOR_SYNC_PORT:-23380}
-  export TRAF_SCALING_FACTOR=${TRAF_SCALING_FACTOR:-0.75}
-fi
-
-#
-#   NAME-SERVER - to disable process replication and enable the name-server
-#
-# Uncomment the next environment variable
-#export SQ_NAMESERVER_ENABLED=1
-if [[ "$SQ_NAMESERVER_ENABLED" == "1" ]]; then
-  export NS_COMM_PORT=${NS_COMM_PORT:-23370}
-  export NS_SYNC_PORT=${NS_SYNC_PORT:-23360}
-  export NS_M2N_COMM_PORT=${NS_M2N_COMM_PORT:-23350}
-  export MON2MON_COMM_PORT=${MON2MON_COMM_PORT:-23340}
-fi
-
-# Alternative logging capability in monitor
-export SQ_MON_ALTLOG=0
-
-# Monitor sync thread responsiveness timeout
-# default 15 mins
-export SQ_MON_SYNC_TIMEOUT=900
-
-export SQ_MON_KEEPALIVE=1
-export SQ_MON_KEEPIDLE=60
-export SQ_MON_KEEPINTVL=6
-export SQ_MON_KEEPCNT=5
-
-# Monitor sync thread epoll wait timeout is in seconds
-# Currently set to 64 seconds (16 second timeout, 4 retries)
-export SQ_MON_EPOLL_WAIT_TIMEOUT=16
-export SQ_MON_EPOLL_RETRY_COUNT=4
-
-# Monitor Zookeeper client
-#  - A zero value disables the zclient logic in the monitor process.
-#    It is enabled by default in a real cluster, disabled otherwise.
-#      (must be disabled to debug monitor processes in a real cluster)
-#export SQ_MON_ZCLIENT_ENABLED=0
-#  - Session timeout in seconds defines when Zookeeper quorum determines a
-#    non-responsive monitor zclient which results in a Trafodion node down. 
-#    Default is 60 seconds (1 minute) which is the maximum Zookeeper allows.
-#export SQ_MON_ZCLIENT_SESSION_TIMEOUT=60
-#  - My znode monitoring timeout in seconds defines frequency when local
-#    monitor's znode is checked. Uncomment to override default value.
-#    Default is 5 seconds.
-#export SQ_MON_ZCLIENT_MY_ZNODE_CHECKRATE=5
-
-# Trafodion Configuration Zookeeper store
-#export TC_ZCONFIG_SESSION_TIMEOUT=120
-
-# increase SQ_MON,ZCLIENT,WDT timeout only to jenkins env.
-if [[ "$TRAF_HOME" == *"/home/jenkins"* ]]; then
-export SQ_MON_EPOLL_WAIT_TIMEOUT=20
-export SQ_MON_ZCLIENT_SESSION_TIMEOUT=360
-export SQ_WDT_KEEPALIVETIMERVALUE=360
-fi
-
-# set to 0 to disable phandle verifier
-export SQ_PHANDLE_VERIFIER=1
-
-# set to 0 to disable process name long format in clusters larger that 256 nodes
-#export SQ_MON_PROCESS_NAME_FORMAT_LONG=0
-#   short format: '$Zxxpppp'     xx   = nid, pppp   = pid
-#   long  format: '$Zxxxxpppppp' xxxx = nid, pppppp = pid (default)
-
-# set to 0 to disable or 1 to enable configuration of DTM as a persistent process
-# must re-execute 'sqgen' to effect change
-export SQ_DTM_PERSISTENT_PROCESS=1
-
-# Check the state of the node with the cluster manager during regroup
-export SQ_WDT_CHECK_CLUSTER_STATE=0
-
-# Enable SQ_PIDMAP if you want to get a record of process activity.
-# This can be useful in troubleshooting problems.  There is an overhead cost
-# incurred each time a process is started so do not enable this if performance
-# is critical.
-# Log process start/end messages in $TRAF_VAR/monitor.map
-export SQ_PIDMAP=1
-
 #Enable RMS (SQL Run time statistics)
 export SQ_START_RMS=1
 
@@ -1019,3 +926,129 @@
   echo $CLASSPATH | sed -e's/:/ /g' | fmt -w2 | xargs printf '\t%s\n'
   echo
 fi
+
+###########################
+# Trafodion monitor process
+###########################
+#
+# NOTE: in a Python installation when SQ_MON_RUN_MODE below
+#       is AGENT the SQ_MON_CREATOR must be MPIRUN
+#
+#   MPIRUN - monitor process is created by mpirun
+#            (meaning that mpirun is the parent process of the monitor process)
+#   AGENT  - monitor process runs in agent mode versus MPI collective
+#
+if [[ -z ${TRAF_AGENT} ]]; then
+  if [[ -e $TRAF_CONF/sqconfig ]]; then
+     node_count=`grep -o 'node-name=.[A-Za-z0-9\.\-]*' $TRAF_CONF/sqconfig | cut -d "=" -f 2 | cut -d ";" -f 1 | sort -u | wc -l`
+  else
+     node_count=1
+  fi
+  # Set monitor to run in agent mode in a cluster environment
+  if  [[ -n "$node_count" ]] && [[ "$node_count" -gt "1" ]]; then    
+     export SQ_MON_CREATOR=MPIRUN
+  fi
+  
+  if [[ "$SQ_MON_CREATOR" == "MPIRUN" ]]; then
+    export SQ_MON_RUN_MODE=${SQ_MON_RUN_MODE:-AGENT}
+    export MONITOR_COMM_PORT=${MONITOR_COMM_PORT:-23390}
+    export TRAF_SCALING_FACTOR=${TRAF_SCALING_FACTOR:-0.75}
+  fi
+fi
+  
+#
+#   NAME-SERVER - to disable process replication and enable the name-server
+#
+# Uncomment the next environment variable
+# Set the number of nodes configured
+#export SQ_NAMESERVER_ENABLED=1
+if [[ "$SQ_NAMESERVER_ENABLED" == "1" ]]; then
+  export NS_COMM_PORT=${NS_COMM_PORT:-23370}
+#  export NS_SYNC_PORT=${NS_SYNC_PORT:-23360}
+  export NS_M2N_COMM_PORT=${NS_M2N_COMM_PORT:-23350}
+  export MON2MON_COMM_PORT=${MON2MON_COMM_PORT:-23340}
+fi
+
+# Alternative logging capability in monitor
+export SQ_MON_ALTLOG=0
+
+#
+#   Monitor - Sync Thread 
+#
+# Monitor sync thread responsiveness timeout (default 15 mins)
+export SQ_MON_SYNC_TIMEOUT=${SQ_MON_SYNC_TIMEOUT:-900}
+
+# Monitor sync thread responsiveness logging frequency (default 1 min)
+export SQ_MON_SYNC_DELAY_LOGGING_FREQUENCY=${SQ_MON_SYNC_DELAY_LOGGING_FREQUENCY:-60}
+
+# Monitor sync thread threshold (default 20% of SQ_MON_SYNC_TIMEOUT, maximum is 50%)
+export SQ_MON_SYNC_DELAY_LOGGING_THRESHOLD=${SQ_MON_SYNC_DELAY_LOGGING_THRESHOLD:-20}
+
+# Using the above defaults, the logging threshold is 180 seconds and the
+# logging frequency is 60 seconds. So the first 'Sync thread not responsive'
+# message is logged after 3 minutes (180 seconds) and every minute (60 seconds) after.
+
+export SQ_MON_KEEPALIVE=1
+export SQ_MON_KEEPIDLE=60
+export SQ_MON_KEEPINTVL=6
+export SQ_MON_KEEPCNT=5
+
+# Monitor sync thread epoll wait timeout is in seconds
+# Currently set to 64 seconds (16 second timeout, 4 retries)
+export SQ_MON_EPOLL_WAIT_TIMEOUT=${SQ_MON_EPOLL_WAIT_TIMEOUT:-16}
+export SQ_MON_EPOLL_RETRY_COUNT=${SQ_MON_EPOLL_RETRY_COUNT:-4}
+
+# Monitor Zookeeper client
+#  - A zero value disables the zclient logic in the monitor process.
+#    It is enabled by default in a real cluster, disabled otherwise.
+#      (must be disabled to debug monitor processes in a real cluster)
+#export SQ_MON_ZCLIENT_ENABLED=0
+#  - Session timeout in seconds defines when Zookeeper quorum determines a
+#    non-responsive monitor zclient which results in a Trafodion node down. 
+#    Default is 60 seconds (1 minute) which is the maximum Zookeeper allows.
+#export SQ_MON_ZCLIENT_SESSION_TIMEOUT=60
+#  - My znode monitoring timeout in seconds defines frequency when local
+#    monitor's znode is checked. Uncomment to override default value.
+#    Default is 5 seconds.
+#export SQ_MON_ZCLIENT_MY_ZNODE_CHECKRATE=5
+
+# Trafodion Configuration Zookeeper store
+#export TC_ZCONFIG_SESSION_TIMEOUT=120
+
+# sqwatchdog process ($WDTn) timer expiration value (default 60 seconds)
+export SQ_WDT_KEEPALIVETIMERVALUE=${SQ_WDT_KEEPALIVETIMERVALUE:-60}
+
+# Increase the SQ_MON, ZCLIENT, and WDT timeouts only in the Jenkins environment.
+if [[ "$TRAF_HOME" == *"/home/jenkins"* ]]; then
+export SQ_MON_EPOLL_WAIT_TIMEOUT=20
+export SQ_MON_ZCLIENT_SESSION_TIMEOUT=360
+export SQ_WDT_KEEPALIVETIMERVALUE=360
+fi
+
+# set to 0 to disable phandle verifier
+export SQ_PHANDLE_VERIFIER=1
+
+# set to 0 to disable process name long format in clusters larger than 256 nodes
+#export SQ_MON_PROCESS_NAME_FORMAT_LONG=0
+#   short format: '$Zxxpppp'     xx   = nid, pppp   = pid
+#   long  format: '$Zxxxxpppppp' xxxx = nid, pppppp = pid (default)
+
+# set to 0 to disable or 1 to enable configuration of DTM as a persistent process
+# must re-execute 'sqgen' to effect change
+export SQ_DTM_PERSISTENT_PROCESS=1
+
+# Check the state of the node with the cluster manager during regroup
+export SQ_WDT_CHECK_CLUSTER_STATE=0
+
+# Enable SQ_PIDMAP if you want to get a record of process activity.
+# This can be useful in troubleshooting problems.  There is an overhead cost
+# incurred each time a process is started so do not enable this if performance
+# is critical.
+# Log process start/end messages in $TRAF_VAR/monitor.map
+export SQ_PIDMAP=1
+
+#################################
+# End - Trafodion monitor process
+#################################
+
+
diff --git a/core/sqf/sql/scripts/gensq.pl b/core/sqf/sql/scripts/gensq.pl
index 0acb12e..1973384 100755
--- a/core/sqf/sql/scripts/gensq.pl
+++ b/core/sqf/sql/scripts/gensq.pl
@@ -109,7 +109,7 @@
         return;
     }
 
-    printScript(1, "#!/bin/sh \n");
+    printScript(1, "#!/bin/bash \n");
     printTime;
 
 #    $smenv = "$ENV{'TRAF_VAR'}/seamonster.env";
@@ -187,10 +187,17 @@
 
 #        printScript(1, "\nset CLUSTERNAME=\$CLUSTERNAME\n");
 #    printScript(1, "\nset SQ_MBTYPE=$ENV{'SQ_MBTYPE'}\n");
-#    printScript(1, "\nset MY_NODES=\$MY_NODES\n");
+#    printScript(1, "\nset JAVA_HOME=\$JAVA_HOME\n");
+#    printScript(1, "\nset TRAF_CLUSTER_ID=\$TRAF_CLUSTER_ID\n");
+#    printScript(1, "\nset TRAF_INSTANCE_ID=\$TRAF_INSTANCE_ID\n");
+#    printScript(1, "\nset TRAF_FOUNDATION_READY=0\n");
 
 #    sqconfigdb::addDbClusterData( "SQ_MBTYPE", $ENV{'SQ_MBTYPE'});
 #    sqconfigdb::addDbClusterData( "TRAF_HOME", "$TRAF_HOME"); # comes out null
+#    sqconfigdb::addDbClusterData( "JAVA_HOME", "$JAVA_HOME"); 
+#    sqconfigdb::addDbClusterData( "TRAF_CLUSTER_ID", "$TRAF_CLUSTER_ID");
+#    sqconfigdb::addDbClusterData( "TRAF_INSTANCE_ID", "$TRAF_INSTANCE_ID");
+#    sqconfigdb::addDbClusterData( "TRAF_FOUNDATION_READY", "0"); 
 
 #    genSQShellExit();
 
@@ -478,7 +485,7 @@
 
     my $file_ptr  = @_[0];
 
-    print $file_ptr "#!/bin/sh\n";
+    print $file_ptr "#!/bin/bash\n";
     print $file_ptr "# Trafodion config/utility file generated @ ", getTime(), "\n";
 }
 
@@ -586,28 +593,6 @@
 
 }
 
-sub setupDbUniqStrings {
-
-    my $my_scripts_dir = "$TRAF_HOME" . "/sql/scripts/" ;
-    for ($i=0; $i < $gdNumNodes; $i++) {
-        sqconfigdb::addDbUniqStr($i, 1, 'shell');
-        sqconfigdb::addDbUniqStr($i, 2, 'pstartd');
-        sqconfigdb::addDbUniqStr($i, 3, 'sqwatchdog');
-        sqconfigdb::addDbUniqStr($i, 4, 'idtmsrv');
-        sqconfigdb::addDbUniqStr($i, 5, 'tm');
-        sqconfigdb::addDbUniqStr($i, 6, 'service_monitor');
-        sqconfigdb::addDbUniqStr($i, 7, 'mxsscp');
-        sqconfigdb::addDbUniqStr($i, 8, 'mxssmp');
-        sqconfigdb::addDbUniqStr($i, 9, 'run_command');
-        sqconfigdb::addDbUniqStr($i, 10, 'mxosrvr');
-        sqconfigdb::addDbUniqStr($i, 11, 'tdm_arkesp');
-        sqconfigdb::addDbUniqStr($i, 12, 'tdm_arkcmp');
-        sqconfigdb::addDbUniqStr($i, 13, 'traf_notify');
-        sqconfigdb::addDbUniqStr($i, 14, 'mxlobsrvr');
-        sqconfigdb::addDbUniqStr($i, 15, 'trafns');
-    }
-}
-
 #
 # Main
 #
@@ -655,8 +640,6 @@
 
 #printZoneList;
 
-setupDbUniqStrings();
-
 printScriptEndLines;
 
 endGame;
diff --git a/core/sqf/sql/scripts/gomon.cold b/core/sqf/sql/scripts/gomon.cold
index ec83310..18835ef 100755
--- a/core/sqf/sql/scripts/gomon.cold
+++ b/core/sqf/sql/scripts/gomon.cold
@@ -1,6 +1,5 @@
 #!/bin/bash
 #
-#/bin/sh
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
@@ -23,20 +22,36 @@
 # @@@ END COPYRIGHT @@@
 #
 
+# Set the number of nodes configured
+let node_count=`trafconf -nid-count`
+
 if [[ -z ${TRAF_AGENT} ]]; then
+
+   # Set monitor to run in agent mode
+   if  [[ -n "$node_count" ]] && [[ "$node_count" -gt "1" ]]; then    
+      export SQ_MON_CREATOR=MPIRUN
+   fi
+
+   if [[ "$SQ_MON_CREATOR" == "MPIRUN" ]]; then
+      export SQ_MON_RUN_MODE=${SQ_MON_RUN_MODE:-AGENT}
+      export MONITOR_COMM_PORT=${MONITOR_COMM_PORT:-23390}
+      echo  "***"
+      echo  "***" `date`  " - SQ_MON_CREATOR  = $SQ_MON_CREATOR"
+      echo  "***" `date`  " - SQ_MON_RUN_MODE = $SQ_MON_RUN_MODE"
+   fi
+
    echo  "***"
    echo  "***" `date`  " - Starting Monitor processes"
    echo  "***"
-
+   
 sqshell <<eof 
 
 ! Start the monitor processes across the cluster
 startup
 
 set SQ_MBTYPE=$SQ_MBTYPE
-set MY_NODES=$MY_NODES
 set JAVA_HOME=$JAVA_HOME
-set MY_CLUSTER_ID=$MY_CLUSTER_ID
+set TRAF_CLUSTER_ID=$TRAF_CLUSTER_ID
 set TRAF_FOUNDATION_READY=0
 
 exit
@@ -45,26 +60,16 @@
 else
    echo `date`" - Waiting for the Trafodion monitor process..."
 
-   # 5 seconds, iterations 240 = 20 minutes
-   let loop_count=0
-   let loop_max=240
    let monitor_ready=0
-   while [[ $loop_count -lt $loop_max ]];
-   do
-     if sqcheckmon -f
-     then
-       let ++monitor_ready
-       break
-     else
-       echo -n "."
-       sleep 5
-       sleep $wtime
-       let ++loop_count
-     fi
-   done
+
+   # 5 seconds, iterations 240 = 20 minutes
+   if sqcheckmon -s up -i 240 -d 5
+   then
+      let ++monitor_ready
+   fi
 
    if [[ $monitor_ready -lt 1 ]]; then
-      echo "Aborting startup!"
+      echo `date`" - Aborting startup!"
       cat $TRAF_LOG/sqcheckmon.log
       exit 1
    else
@@ -73,21 +78,20 @@
 sqshell -a <<eof
 
 set SQ_MBTYPE=$SQ_MBTYPE
-set MY_NODES=$MY_NODES
 set JAVA_HOME=$JAVA_HOME
-set MY_CLUSTER_ID=$MY_CLUSTER_ID
+set TRAF_CLUSTER_ID=$TRAF_CLUSTER_ID
 set TRAF_FOUNDATION_READY=0
 
 exit
 eof
-
-      echo `date`" - Delaying 15 seconds before continuing with Startup"
-      sleep 15
-      echo `date`" - Continuing with Startup ..."
-      echo
    fi
 fi
 
+echo `date`" - Delaying 15 seconds before continuing with Startup"
+sleep 15
+echo `date`" - Continuing with Startup ..."
+echo
+
 if (
     [[ $TRAF_AGENT == "CM" ]] ||
     [[ $SQ_MON_RUN_MODE == "AGENT" ]]
@@ -95,8 +99,6 @@
 then
    export TRAF_SCALING_FACTOR=${TRAF_SCALING_FACTOR:-0.75}
 
-   # Set the number of nodes configured
-   let node_count=`trafconf -nid-count`
    #echo  "***"
    #echo  "*** node_count = ${node_count}"
    #echo  "*** TRAF_SCALING_FACTOR = ${TRAF_SCALING_FACTOR}"
@@ -113,7 +115,9 @@
 sqshell -c node info
 
 # Starting TSID
-
+echo  "***"
+echo  "***" `date`  " - Starting Transaction Id Server (TSID)"
+echo  "***"
 idtmstart
 sqshell -c delay 1
 
diff --git a/core/sqf/sql/scripts/install_apache_hadoop b/core/sqf/sql/scripts/install_apache_hadoop
index d452cc4..ffcbcb5 100755
--- a/core/sqf/sql/scripts/install_apache_hadoop
+++ b/core/sqf/sql/scripts/install_apache_hadoop
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
@@ -700,7 +700,7 @@
   # scripts to start/stop environment
   ####################################
   cat <<EOF >$MY_SW_SCRIPTS_DIR/swstartall
-#!/bin/sh
+#!/bin/bash
 echo "Starting Hadoop, MySQL, HBase..."
 cd ${MY_SW_SCRIPTS_DIR}
 ./swstarthadoop
@@ -709,7 +709,7 @@
 EOF
 
   cat <<EOF >$MY_SW_SCRIPTS_DIR/swstopall
-#!/bin/sh
+#!/bin/bash
 echo "Stopping Hadoop, MySQL, HBase..."
 cd ${MY_SW_SCRIPTS_DIR}
 ./swstophbase
@@ -718,7 +718,7 @@
 EOF
 
   cat <<EOF >$MY_SW_SCRIPTS_DIR/swstarthadoop
-#!/bin/sh
+#!/bin/bash
 echo "Starting Hadoop..."
 cd ${MY_SW_ROOT}
 . $MY_SW_SCRIPTS_DIR/sw_env.sh
@@ -727,7 +727,7 @@
 EOF
 
   cat <<EOF >$MY_SW_SCRIPTS_DIR/swstophadoop
-#!/bin/sh
+#!/bin/bash
 echo "Stopping Hadoop..."
 cd ${MY_SW_ROOT}
 . $MY_SW_SCRIPTS_DIR/sw_env.sh
@@ -736,7 +736,7 @@
 EOF
 
   cat <<EOF >$MY_SW_SCRIPTS_DIR/swstartmysql
-#!/bin/sh
+#!/bin/bash
 echo "Starting mysqld..."
 cd ${MY_SW_ROOT}/mysql
 . $MY_SW_SCRIPTS_DIR/sw_env.sh
@@ -744,7 +744,7 @@
 EOF
 
   cat <<EOF >$MY_SW_SCRIPTS_DIR/swstopmysql
-#!/bin/sh
+#!/bin/bash
 echo "Stopping mysqld..."
 cd ${MY_SW_ROOT}/mysql
 . $MY_SW_SCRIPTS_DIR/sw_env.sh
@@ -752,7 +752,7 @@
 EOF
 
   cat <<EOF >$MY_SW_SCRIPTS_DIR/swstarthbase
-#!/bin/sh
+#!/bin/bash
 echo "Starting HBase..."
 cd ${MY_SW_ROOT}
 . $MY_SW_SCRIPTS_DIR/sw_env.sh
@@ -760,14 +760,14 @@
 EOF
 
   cat <<EOF >$MY_SW_SCRIPTS_DIR/swstophbase
-#!/bin/sh
+#!/bin/bash
 cd ${MY_SW_ROOT}
 . $MY_SW_SCRIPTS_DIR/sw_env.sh
 ./hbase/bin/stop-hbase.sh
 EOF
 
   cat <<EOF >$MY_SW_SCRIPTS_DIR/swstatus
-#!/bin/sh
+#!/bin/bash
 cd ${MY_SW_ROOT}
 . $MY_SW_SCRIPTS_DIR/sw_env.sh
 NUM_JAVA_PROCS=\`ps -aef | grep \$USER | grep java | grep -v grep | wc -l\`
@@ -788,7 +788,7 @@
   # scripts to start command line interpreters and tools
   #######################################################
   cat <<EOF >$MY_SW_SCRIPTS_DIR/swhadoop
-#!/bin/sh
+#!/bin/bash
 # command to run hadoop
 
 . $MY_SW_SCRIPTS_DIR/sw_env.sh
@@ -796,7 +796,7 @@
 EOF
 
   cat <<EOF >$MY_SW_SCRIPTS_DIR/swyarn
-#!/bin/sh
+#!/bin/bash
 # command to run yarn
 
 . $MY_SW_SCRIPTS_DIR/sw_env.sh
@@ -804,7 +804,7 @@
 EOF
 
   cat <<EOF >$MY_SW_SCRIPTS_DIR/swhdfs
-#!/bin/sh
+#!/bin/bash
 # command to run hadoop
 
 . $MY_SW_SCRIPTS_DIR/sw_env.sh
@@ -812,7 +812,7 @@
 EOF
 
   cat <<EOF >$MY_SW_SCRIPTS_DIR/swmysql
-#!/bin/sh
+#!/bin/bash
 # command to run mysql
 
 . $MY_SW_SCRIPTS_DIR/sw_env.sh
@@ -820,7 +820,7 @@
 EOF
 
   cat <<EOF >$MY_SW_SCRIPTS_DIR/swmysqladmin
-#!/bin/sh
+#!/bin/bash
 # command to run mysqladmin as root user
 
 . $MY_SW_SCRIPTS_DIR/sw_env.sh
@@ -828,7 +828,7 @@
 EOF
 
   cat <<EOF >$MY_SW_SCRIPTS_DIR/swhive
-#!/bin/sh
+#!/bin/bash
 # command to run hive command line interpreter
 
 # Pick up MySQL JDBC driver
@@ -838,7 +838,7 @@
 EOF
 
   cat <<EOF >$MY_SW_SCRIPTS_DIR/swhbase
-#!/bin/sh
+#!/bin/bash
 # command to run hbase shell
 
 . $MY_SW_SCRIPTS_DIR/sw_env.sh
@@ -846,7 +846,7 @@
 EOF
 
   cat <<EOF >$MY_SW_SCRIPTS_DIR/swuninstall_local_hadoop
-#!/bin/sh
+#!/bin/bash
 # uninstall local Hadoop instance.
 
 . $MY_SW_SCRIPTS_DIR/swstopall
@@ -1648,7 +1648,7 @@
   then
     echo "Adding swjdbc script...." | tee -a ${MY_LOG_FILE}
     cat <<EOF >$MY_SW_SCRIPTS_DIR/swjdbc
-#!/bin/sh
+#!/bin/bash
 # command to run JDBC tests
 cd $DCS_SRC/src/test/jdbc_test
 ./jdbc_test.py --appid=jdbc_test --user=SOMEUSER --pw=SOMEPASSWORD --javahome=\$JAVA_HOME \\
@@ -1686,7 +1686,7 @@
   # existing local_hadoop installations
   echo "Adding swphoenix script...." | tee -a ${MY_LOG_FILE}
   cat <<EOF >$MY_SW_SCRIPTS_DIR/swphoenix
-#!/bin/sh
+#!/bin/bash
 # command to run phoenix tests
 
 cd $PHX_SRC
diff --git a/core/sqf/sql/scripts/install_local_hadoop b/core/sqf/sql/scripts/install_local_hadoop
index 9055c70..b6b4c0b 100755
--- a/core/sqf/sql/scripts/install_local_hadoop
+++ b/core/sqf/sql/scripts/install_local_hadoop
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
@@ -361,7 +361,7 @@
 *)  echo "ERROR: Unexpected argument $1"
     echo
     cat <<EOF
-Syntax: $0 [ -p [<starting port num> | rand | fromDisplay | available ] ]  [-y]  [-n] [-v]
+Syntax: $0 [ -p [<starting port num> | rand | fromDisplay | available] ]  [-y]  [-n] [-v]
 EOF
     exit 1
     ;;
@@ -397,9 +397,10 @@
 
      portMsg=`python findPort.py`
 
-     missingPort=`echo $portMsg | cut -d':' -f 1-1`
+echo $portMsg
+     missingPort=`echo $portMsg | cut -d':' -f 2-`
 
-     if [[ $missingPort != "Port ranges not in use" ]]; then
+     if [[ "$missingPort" == "" ]]; then
         echo "No free port available, exit"
         exit 1
      fi
@@ -632,8 +633,8 @@
 fi
 
 echo "Checking for existing Hadoop processes..."
-if [ `netstat -anl | grep ${MY_HADOOP_JOB_TRACKER_HTTP_PORT_NUM} | grep LISTEN | wc -l` -gt 0 -o \
-     `netstat -anl | grep ${MY_HADOOP_NN_HTTP_PORT_NUM} | grep LISTEN | wc -l` -gt 0 ]; then
+if [ `netstat -anl | grep ${MY_HADOOP_JOB_TRACKER_HTTP_PORT_NUM} | grep -w LISTEN | wc -l` -gt 0 -o \
+     `netstat -anl | grep ${MY_HADOOP_NN_HTTP_PORT_NUM} | grep -w LISTEN | wc -l` -gt 0 ]; then
   echo '**** ERROR:'
   echo "A process is already listening to port ${MY_HADOOP_JOB_TRACKER_HTTP_PORT_NUM} or ${MY_HADOOP_NN_HTTP_PORT_NUM}."
   echo "This could be your own HDFS web interface or that of someone else."
@@ -778,7 +779,7 @@
   cat <<EOF >$MY_SW_SCRIPTS_DIR/sw_env.sh
 # Basic environment variables for Trafodion/Hadoop/Hive/HBase/MySQL setup
 export JAVA_HOME=${JAVA_HOME}
-export JAVA_LIBRARY_PATH=${LD_LIBRARY_PATH}
+export JAVA_LIBRARY_PATH=\${LD_LIBRARY_PATH}
 export MY_SW_SCRIPTS_DIR=${MY_SW_SCRIPTS_DIR}
 export MY_SW_ROOT=${MY_SW_ROOT}
 export MYSQL_HOME=${MYSQL_HOME}
@@ -802,6 +803,14 @@
 export MY_DRILL_INFO_PORT_NUM=${MY_DRILL_INFO_PORT_NUM}
 export MY_DRILL_RPC_PORT_NUM=${MY_DRILL_RPC_PORT_NUM}
 export MY_DRILL_BIT_PORT_NUM=${MY_DRILL_BIT_PORT_NUM}
+export ZOOKEEPER_NODES=localhost
+export ZOOKEEPER_PORT=${MY_HBASE_ZOOKEEPER_PROPERTY_CLIENTPORT_NUM}
+export TRAF_CLUSTER_ID=1
+export TRAF_CLUSTER_NAME=$USER
+export TRAF_INSTANCE_ID=1
+export TRAF_INSTANCE_NAME=TRAFODION
+export TRAF_ROOT_ZNODE=/trafodion
+
 # HBase heap size is in MB
 export HBASE_HEAPSIZE=2048
 EOF
@@ -813,7 +822,7 @@
   # scripts to start/stop environment
   ####################################
   cat <<EOF >$MY_SW_SCRIPTS_DIR/swstartall
-#!/bin/sh
+#!/bin/bash
 echo "Starting Hadoop, MySQL, HBase..."
 cd ${MY_SW_SCRIPTS_DIR}
 ./swstarthadoop
@@ -822,7 +831,7 @@
 EOF
 
   cat <<EOF >$MY_SW_SCRIPTS_DIR/swstopall
-#!/bin/sh
+#!/bin/bash
 echo "Stopping Hadoop, MySQL, HBase..."
 cd ${MY_SW_SCRIPTS_DIR}
 ./swstophbase
@@ -831,7 +840,7 @@
 EOF
 
   cat <<EOF >$MY_SW_SCRIPTS_DIR/swstarthadoop
-#!/bin/sh
+#!/bin/bash
 echo "Starting Hadoop..."
 cd ${MY_SW_ROOT}
 . $MY_SW_SCRIPTS_DIR/sw_env.sh
@@ -840,7 +849,7 @@
 EOF
 
   cat <<EOF >$MY_SW_SCRIPTS_DIR/swstophadoop
-#!/bin/sh
+#!/bin/bash
 echo "Stopping Hadoop..."
 cd ${MY_SW_ROOT}
 . $MY_SW_SCRIPTS_DIR/sw_env.sh
@@ -849,7 +858,7 @@
 EOF
 
   cat <<EOF >$MY_SW_SCRIPTS_DIR/swstartmysql
-#!/bin/sh
+#!/bin/bash
 echo "Starting mysqld..."
 cd ${MY_SW_ROOT}/mysql
 . $MY_SW_SCRIPTS_DIR/sw_env.sh
@@ -857,7 +866,7 @@
 EOF
 
   cat <<EOF >$MY_SW_SCRIPTS_DIR/swstopmysql
-#!/bin/sh
+#!/bin/bash
 echo "Stopping mysqld..."
 cd ${MY_SW_ROOT}/mysql
 . $MY_SW_SCRIPTS_DIR/sw_env.sh
@@ -865,7 +874,7 @@
 EOF
 
   cat <<EOF >$MY_SW_SCRIPTS_DIR/swstarthbase
-#!/bin/sh
+#!/bin/bash
 echo "Starting HBase..."
 cd ${MY_SW_ROOT}/hbase
 . $MY_SW_SCRIPTS_DIR/sw_env.sh
@@ -873,14 +882,14 @@
 EOF
 
   cat <<EOF >$MY_SW_SCRIPTS_DIR/swstophbase
-#!/bin/sh
+#!/bin/bash
 cd ${MY_SW_ROOT}/hbase
 . $MY_SW_SCRIPTS_DIR/sw_env.sh
 bin/stop-hbase.sh
 EOF
 
   cat <<EOF >$MY_SW_SCRIPTS_DIR/swstatus
-#!/bin/sh
+#!/bin/bash
 cd \${MY_SW_ROOT}
 . \$MY_SW_SCRIPTS_DIR/sw_env.sh
 JPS_OUTPUT=\`jps\`
@@ -912,7 +921,7 @@
   # scripts to start command line interpreters and tools
   #######################################################
   cat <<EOF >$MY_SW_SCRIPTS_DIR/swhadoop
-#!/bin/sh
+#!/bin/bash
 # command to run hadoop
 
 . $MY_SW_SCRIPTS_DIR/sw_env.sh
@@ -920,7 +929,7 @@
 EOF
 
   cat <<EOF >$MY_SW_SCRIPTS_DIR/swyarn
-#!/bin/sh
+#!/bin/bash
 # command to run yarn
 
 . $MY_SW_SCRIPTS_DIR/sw_env.sh
@@ -928,7 +937,7 @@
 EOF
 
   cat <<EOF >$MY_SW_SCRIPTS_DIR/swhdfs
-#!/bin/sh
+#!/bin/bash
 # command to run hadoop
 
 . $MY_SW_SCRIPTS_DIR/sw_env.sh
@@ -936,7 +945,7 @@
 EOF
 
   cat <<EOF >$MY_SW_SCRIPTS_DIR/swmysql
-#!/bin/sh
+#!/bin/bash
 # command to run mysql
 
 . $MY_SW_SCRIPTS_DIR/sw_env.sh
@@ -944,7 +953,7 @@
 EOF
 
   cat <<EOF >$MY_SW_SCRIPTS_DIR/swmysqladmin
-#!/bin/sh
+#!/bin/bash
 # command to run mysqladmin as root user
 
 . $MY_SW_SCRIPTS_DIR/sw_env.sh
@@ -952,7 +961,7 @@
 EOF
 
   cat <<EOF >$MY_SW_SCRIPTS_DIR/swhive
-#!/bin/sh
+#!/bin/bash
 # command to run hive command line interpreter
 
 # Pick up MySQL JDBC driver
@@ -968,7 +977,7 @@
 EOF
 
   cat <<EOF >$MY_SW_SCRIPTS_DIR/swhbase
-#!/bin/sh
+#!/bin/bash
 # command to run hbase shell
 
 . $MY_SW_SCRIPTS_DIR/sw_env.sh
@@ -976,7 +985,7 @@
 EOF
 
   cat <<EOF >$MY_SW_SCRIPTS_DIR/swzkcli
-#!/bin/sh
+#!/bin/bash
 # command to run hbase zkcli
 
 . $MY_SW_SCRIPTS_DIR/sw_env.sh
@@ -984,7 +993,7 @@
 EOF
 
   cat <<EOF >$MY_SW_SCRIPTS_DIR/swuninstall_local_hadoop
-#!/bin/sh
+#!/bin/bash
 # uninstall local Hadoop instance.
 
 . $MY_SW_SCRIPTS_DIR/swstopall
@@ -1271,14 +1280,13 @@
   bin/hdfs dfs -mkdir /tmp                           >>${MY_LOG_FILE} 2>&1
   bin/hdfs dfs -mkdir /user                          >>${MY_LOG_FILE} 2>&1  
   bin/hdfs dfs -mkdir /user/trafodion                >>${MY_LOG_FILE} 2>&1
-  bin/hdfs dfs -mkdir /user/trafodion/$USER                    >>${MY_LOG_FILE} 2>&1
-  bin/hdfs dfs -mkdir /user/trafodion/hive                     >>${MY_LOG_FILE} 2>&1
+  bin/hdfs dfs -mkdir /user/trafodion/$USER          >>${MY_LOG_FILE} 2>&1
+  bin/hdfs dfs -mkdir /user/trafodion/hive           >>${MY_LOG_FILE} 2>&1
   bin/hdfs dfs -mkdir /user/trafodion/bulkload       >>${MY_LOG_FILE} 2>&1
-  bin/hdfs dfs -mkdir /user/trafodion/hive/warehouse           >>${MY_LOG_FILE} 2>&1
-  bin/hdfs dfs -mkdir /user/trafodion/hive                          >>${MY_LOG_FILE} 2>&1
+  bin/hdfs dfs -mkdir /user/trafodion/hive/warehouse >>${MY_LOG_FILE} 2>&1
+  bin/hdfs dfs -mkdir /user/trafodion/hive           >>${MY_LOG_FILE} 2>&1
   bin/hdfs dfs -chmod g+w /tmp                       >>${MY_LOG_FILE} 2>&1
-  bin/hdfs dfs -chmod g+w /user/trafodion/hive/warehouse       >>${MY_LOG_FILE} 2>&1
-  bin/hdfs dfs -chmod g+w /user/traofdion/bulkload                  >>${MY_LOG_FILE} 2>&1
+  bin/hdfs dfs -chmod g+w /user/trafodion/hive/warehouse   >>${MY_LOG_FILE} 2>&1
   bin/hdfs dfs -chmod g+w /user/trafodion/bulkload   >>${MY_LOG_FILE} 2>&1
   bin/hadoop fs -ls -R /                 2>&1 | tee -a ${MY_LOG_FILE}
   echo "Done: Creating HDFS directories" 2>&1 | tee -a ${MY_LOG_FILE}
@@ -1568,7 +1576,7 @@
   fi
 
   if [[ "$HBASE_DISTRO" =~ "APACHE" ]]; then
-    MASTER_VISBILITY_COPROC="<property>
+    MASTER_VISIBILITY_COPROC="<property>
     <name>hbase.coprocessor.master.classes</name>
     <value>org.apache.hadoop.hbase.security.access.AccessController,
            org.apache.hadoop.hbase.security.visibility.VisibilityController
@@ -1653,7 +1661,7 @@
     <value>true</value>
   </property>
    ${HREGION_IMPL}
-   ${MASTER_VISBILITY_COPROC}
+   ${MASTER_VISIBILITY_COPROC}
 </configuration>
 EOF
 
diff --git a/core/sqf/sql/scripts/monitor.env b/core/sqf/sql/scripts/monitor.env
deleted file mode 100644
index 25029ce..0000000
--- a/core/sqf/sql/scripts/monitor.env
+++ /dev/null
@@ -1,61 +0,0 @@
-# @@@ START COPYRIGHT @@@
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-# @@@ END COPYRIGHT @@@
-
-# Uncomment MON_TRACE_ENABLE and specific tracing level to enable 
-# Trafodion monitor process tracing
-#MON_TRACE_ENABLE=1
-#MON_TRACE_EVLOG_MSG=1
-#MON_TRACE_INIT=1
-#MON_TRACE_RECOVERY=1
-#MON_TRACE_REQUEST=1
-#MON_TRACE_PROCESS=1
-#MON_TRACE_NOTICE=1
-#MON_TRACE_NS=1
-#MON_TRACE_SYNC=1
-# Enable TC_TRACE_* along with MON_TRACE_TRAFCONFIG for more detail
-#MON_TRACE_TRAFCONFIG=1
-#MON_TRACE_MLIO=1
-
-#MON_TRACE_REQUEST_DETAIL=1
-#MON_TRACE_PROCESS_DETAIL=1
-#MON_TRACE_NOTICE_DETAIL=1
-#MON_TRACE_SYNC_DETAIL=1
-#MON_TRACE_MLIO_DETAIL=1
-
-#MON_TRACE_MEAS=1
-#MON_TRACE_TMSYNC=1
-#MON_TRACE_STATS=1
-#MON_TRACE_ENTRY_EXIT=1
-#MON_TRACE_REDIRECTION=1
-#MON_TRACE_HEALTH=1
-#MON_TRACE_SIG_HANDLER=1
-
-# Uncomment TC_TRACE_ENABLE and specific tracing level to enable 
-# Trafodion Configuration library tracing
-# TC_TRACE_ENABLE requires MON_TRACE_TRAFCONFIG above
-#TC_TRACE_ENABLE=1
-#TC_TRACE_REQUEST=1
-#TC_TRACE_NODE=1
-#TC_TRACE_PERSIST=1
-#TC_TRACE_REGISTRY=1
-#TC_TRACE_INIT=1
-#TC_TRACE_LOG_MSG=1
-#TC_TRACE_ENTRY_EXIT=1
diff --git a/core/sqf/sql/scripts/sqgen b/core/sqf/sql/scripts/sqgen
index 839152f..12ceb1e 100755
--- a/core/sqf/sql/scripts/sqgen
+++ b/core/sqf/sql/scripts/sqgen
@@ -24,16 +24,18 @@
 # sqgen script - generates various files
 
 SQCONFIG_FILE=$TRAF_CONF/sqconfig
+SQCONFIG_PERSIST_FILE=$TRAF_CONF/sqconfig.persist
 
 function Usage {
-    script_name=`/bin/basename $0`
+    script_name=`basename $0`
     echo
     echo $script_name generates various Trafodion files in the $TRAF_VAR directory.
     echo
-    echo "Usage: $script_name [ -? | -h ] [<sqconfig_filename>]"
+    echo "Usage: $script_name [ -? | -h ] [<sqconfig_filename> <sqconfig_persist_filename>]"
     echo "  -?    Help"
     echo "  -h    Help"
-    echo "  <sqconfig_filename> Name of the SQ config file (defaults to $SQCONFIG_FILE)"
+    echo "  <sqconfig_filename>         Name of the SQ config file (defaults to $SQCONFIG_FILE)"
+    echo "  <sqconfig_persist_filename> Name of the SQ config persist file (defaults to $SQCONFIG_PERSIST_FILE)"
     echo
     exit 1;
 }
@@ -47,17 +49,19 @@
    i=0
    for NODE in $TempList
      do
+       if [[ ${NODE%%.*} == "$(hostname -s)" ]]; then
+         continue
+       fi
        SQNodeNames[$i]=$NODE
        ((i=i+1))
 
      done
 
    # Check that the Node names were correctly added
-   NumberOfSQnodes=${#SQNodeNames[*]}
-   ExNodeList="$(echo ${SQNodeNames[@]} | tr ' ' ',')"
+   ExNodeList="-w $(echo ${SQNodeNames[@]} | sed -e 's/ / -w '/g)"
 
-   if [ ! -z ${ExNodeList[@]} ]; then   
-      echo "${ExNodeList[@]}"
+   if [[ ${#SQNodeNames[@]} -gt 0 ]]; then   # ExNodeList always starts with "-w ", so test the node array itself
+      echo "$ExNodeList"
    else
       echo
       echo "Could not parse $SQCONFIG_FILE file."
@@ -118,7 +122,7 @@
 cd $TRAF_HOME/sql/scripts
 
 # Check to make sure this is a real cluster configuration
-if  [[ -n "$node_count" ]] && [[ "$node_count" -gt "1" ]]  || [[ -n ${TRAF_AGENT} ]]; then
+if  [[ -n "$node_count" ]] && [[ "$node_count" -gt "1" ]]  ; then
     #
     # NOTE: Temporarily bypass the following check and always recreate the
     #       'sqconfig.db' file on the local node when invoked from an
@@ -155,8 +159,8 @@
 
     GetSQcnfg
 else
-    echo "node_count=${node_Count}"
-    echo
+    #echo "node_count=${node_Count}"
+    #echo
     echo "Workstation environment - Not a clustered environment"
     if [ -f $SQCONFIG_DB_FILE ]; then
         echo
@@ -173,12 +177,16 @@
     case "$flag" in
         -h)  Usage ;;
         -?)  Usage ;;
-        *)   SQCONFIG_FILE=$1 ;;
+        *)   SQCONFIG_FILE=$1
+             echo "SQCONFIG_FILE=$SQCONFIG_FILE"
+             shift
+             SQCONFIG_PERSIST_FILE=${1:-$SQCONFIG_PERSIST_FILE}  # keep the default when no persist file argument follows
+             echo "SQCONFIG_PERSIST_FILE=$SQCONFIG_PERSIST_FILE"
+             ;;
     esac
     shift
   done
 
-
 export SQLOG_DIR=$TRAF_LOG
 mkdir -p $TRAF_VAR
 mkdir -p $SQLOG_DIR
@@ -188,30 +196,29 @@
 mkdir -p $HOME/cbfs
 
 # Clean HBase classpath cache file
-echo "Clean up HBase classpath cache file: $TRAF_VAR/hbase_classpath"
+#echo "Clean up HBase classpath cache file: $TRAF_VAR/hbase_classpath"
 rm -rf $TRAF_VAR/hbase_classpath
 
 # Bypass if in agent mode
 if [[ -z ${TRAF_AGENT} ]]; then
-    #If pdsh exists
     if  [[ -n "$node_count" ]] && [[ "$node_count" -gt "1" ]]; then    
         echo
         echo "Creating directories on cluster nodes"
 
         # Clean HBase classpath cache file on all nodes
-        $PDSH -w ${ExNodeList[@]} -x `uname -n` $PDSH_SSH_CMD rm -rf $TRAF_VAR/hbase_classpath
+        $PDSH $ExNodeList rm -rf $TRAF_VAR/hbase_classpath
 
-        echo "$PDSH -w ${ExNodeList[@]} -x `uname -n` $PDSH_SSH_CMD mkdir -p $TRAF_VAR "
-        $PDSH -w ${ExNodeList[@]} -x `uname -n` $PDSH_SSH_CMD mkdir -p $TRAF_VAR
+        echo "$PDSH $ExNodeList mkdir -p $TRAF_VAR "
+        $PDSH $ExNodeList mkdir -p $TRAF_VAR
         
-        echo "$PDSH -w ${ExNodeList[@]} -x `uname -n` $PDSH_SSH_CMD mkdir -p $SQLOG_DIR "
-        $PDSH -w ${ExNodeList[@]} -x `uname -n` $PDSH_SSH_CMD mkdir -p $SQLOG_DIR
+        echo "$PDSH $ExNodeList mkdir -p $SQLOG_DIR "
+        $PDSH $ExNodeList mkdir -p $SQLOG_DIR
     
-        echo "$PDSH -w ${ExNodeList[@]} -x `uname -n` $PDSH_SSH_CMD mkdir -p $MPI_TMPDIR "
-        $PDSH -w ${ExNodeList[@]} -x `uname -n` $PDSH_SSH_CMD mkdir -p $MPI_TMPDIR
+        echo "$PDSH $ExNodeList mkdir -p $MPI_TMPDIR "
+        $PDSH $ExNodeList mkdir -p $MPI_TMPDIR
     
-        echo "$PDSH -w ${ExNodeList[@]} -x `uname -n` $PDSH_SSH_CMD mkdir -p $MPI_TMPDIR/tmp "
-        $PDSH -w ${ExNodeList[@]} -x `uname -n` $PDSH_SSH_CMD mkdir -p $MPI_TMPDIR/tmp
+        echo "$PDSH $ExNodeList mkdir -p $MPI_TMPDIR/tmp "
+        $PDSH $ExNodeList mkdir -p $MPI_TMPDIR/tmp
     
     fi
 fi
@@ -219,13 +226,13 @@
 SQSCRIPT_FILE=./gomon
 SQESPENV_FILE=$TRAF_VAR/tdm_arkesp.env
 
-echo
+#echo
 if [ -f $TRAF_VAR/ms.env ]; then
-    echo "The SQ environment variable file $TRAF_VAR/ms.env exists."
+    echo "The environment variable file $TRAF_VAR/ms.env exists."
     echo "The file will not be re-generated."
     echo
 else
-    echo "Generating SQ environment variable file: $TRAF_VAR/ms.env"
+    echo "Generating environment variable file: $TRAF_VAR/ms.env"
     echo
     ./genms > $TRAF_VAR/ms.env
     lv_retcode=$?
@@ -233,6 +240,10 @@
         echo "Error $lv_retcode while executing genms. Exiting..."
         exit $lv_retcode
     fi
+    if [[ -f $TRAF_CONF/ms.env.add ]]
+    then
+      cat $TRAF_CONF/ms.env.add >> $TRAF_VAR/ms.env
+    fi
 fi
 
 ./gensqstatem2lenv > $TRAF_VAR/sqstatem2l.env
@@ -242,7 +253,7 @@
 .quit
 eof
 
-./gensq.pl $SQSCRIPT_FILE `hostname` $FT_FLAG $PERF_FLAG $SQCONFIG_FILE $TRAF_HOME/sql/scripts/sqconfig.persist
+./gensq.pl $SQSCRIPT_FILE `hostname` $FT_FLAG $PERF_FLAG $SQCONFIG_FILE $SQCONFIG_PERSIST_FILE
 sq_stat=$?
 if [[ $sq_stat != 0 ]]; then 
     exit $sq_stat;
@@ -278,32 +289,37 @@
     if  [[ -n "$node_count" ]] && [[ "$node_count" -gt "1" ]]; then    
         echo
         echo
-        echo "Copying the generated files to all the nodes in the cluster"
+        echo "Copying the configuration and generated files to all the nodes in the cluster"
         echo
-        echo "Copying $TRAF_VAR/ms.env to $TRAF_VAR of all the nodes"
-        echo "$PDCP -w ${ExNodeList[@]} -x `uname -n` $TRAF_VAR/ms.env   $TRAF_VAR "
-        $PDCP -w ${ExNodeList[@]} -x `uname -n` $TRAF_VAR/ms.env   $TRAF_VAR
+
+        echo "Copying $TRAF_VAR/ms.env to $TRAF_VAR to all the nodes"
+        echo "$PDCP $ExNodeList $TRAF_VAR/ms.env   $TRAF_VAR "
+        $PDCP $ExNodeList $TRAF_VAR/ms.env   $TRAF_VAR
+
+        echo "Copying $SQCONFIG_FILE and $SQCONFIG_PERSIST_FILE to $TRAF_CONF to all the nodes"
+        echo "$PDCP $ExNodeList $SQCONFIG_FILE $SQCONFIG_PERSIST_FILE    $TRAF_CONF "
+        $PDCP $ExNodeList $SQCONFIG_FILE $SQCONFIG_PERSIST_FILE    $TRAF_CONF
 
 
         echo
-        echo "Copying $TRAF_VAR/seamonster.env to $TRAF_VAR of all the nodes"
-        echo "$PDCP -w ${ExNodeList[@]} -x `uname -n` $TRAF_VAR/seamonster.env   $TRAF_VAR "
-        $PDCP -w ${ExNodeList[@]} -x `uname -n` $TRAF_VAR/seamonster.env   $TRAF_VAR
+        echo "Copying $TRAF_VAR/seamonster.env to $TRAF_VAR to all the nodes"
+        echo "$PDCP $ExNodeList $TRAF_VAR/seamonster.env   $TRAF_VAR "
+        $PDCP $ExNodeList $TRAF_VAR/seamonster.env   $TRAF_VAR
 
         if [[ $sq_seamonster == 1 ]]; then 
             if [ -f $SQESPENV_FILE ]; then
                 echo
-                echo "Copying $SQESPENV_FILE to $TRAF_VAR of all the nodes"
-                echo "$PDCP -w ${ExNodeList[@]} -x `uname -n` $SQESPENV_FILE   $TRAF_VAR "
-                $PDCP -w ${ExNodeList[@]} -x `uname -n` $SQESPENV_FILE   $TRAF_VAR
+                echo "Copying $SQESPENV_FILE to $TRAF_VAR to all the nodes"
+                echo "$PDCP $ExNodeList $SQESPENV_FILE   $TRAF_VAR "
+                $PDCP $ExNodeList $SQESPENV_FILE   $TRAF_VAR
             fi
         fi
 
         echo
-        echo "Copying Trafodion Configuration files to $TRAF_VAR"
+        echo "Copying Trafodion Configuration database file to $TRAF_VAR"
 
-        echo "$PDCP -w ${ExNodeList[@]} -x `uname -n` $SQCONFIG_FILE $SQCONFIG_DB_FILE $TRAF_VAR/ "
-        $PDCP -w ${ExNodeList[@]} -x `uname -n` $SQCONFIG_FILE $SQCONFIG_DB_FILE $TRAF_VAR/
+        echo "$PDCP $ExNodeList $SQCONFIG_DB_FILE $TRAF_VAR/ "
+        $PDCP $ExNodeList $SQCONFIG_DB_FILE $TRAF_VAR/
 
         echo
     fi
@@ -317,7 +333,6 @@
 echo
 if (test -f $TRAF_HOME/sql/scripts/sqcertgen); then
   $TRAF_HOME/sql/scripts/sqcertgen 2>/dev/null
-  echo
 else
   echo
   echo "ERROR: Certificate generation script (sqcertgen) does not exist in $TRAF_HOME/sql/scripts folder"
diff --git a/core/sqf/sql/scripts/tmstart b/core/sqf/sql/scripts/tmstart
index 2ca0eac..8c04d6f 100755
--- a/core/sqf/sql/scripts/tmstart
+++ b/core/sqf/sql/scripts/tmstart
@@ -22,6 +22,15 @@
 # @@@ END COPYRIGHT @@@
 #
 
+# Check whether TM processes are already running on the cluster.
+# If any are found, this script has been executed before to start
+# the TMs, so SQ_TXNSVC_READY is set to 1 below.
+lv_num_tm_processes=$(sqshell -c persist info DTM | grep DTM | wc -l)
+echo "Number of TM processes on the instance: ${lv_num_tm_processes}"
+if [[ ${lv_num_tm_processes} -gt 0 ]]; then  # numeric test; '>' inside [ ] compares strings lexicographically
+    sqshell -c set SQ_TXNSVC_READY=1
+fi
+
 sqshell -a <<eof
 
 ! Start DTM
@@ -29,7 +38,6 @@
 set SQ_AUDITSVC_READY=1
 set DTM_TLOG_PER_TM=1
 set TRAF_TM_LOCKED=0
-persist exec DTM
 delay 5
 exit
 eof
diff --git a/core/sqf/src/seabed/src/Makefile b/core/sqf/src/seabed/src/Makefile
index 3d77af6..79215e1 100644
--- a/core/sqf/src/seabed/src/Makefile
+++ b/core/sqf/src/seabed/src/Makefile
@@ -53,6 +53,7 @@
 LIBSBSAUTIL	= $(LIBEXPDIR)/libsbsautil.so
 LIBSBUTIL	= $(LIBEXPDIR)/libsbutil.so
 LIBSQSTATESB	= $(LIBEXPDIR)/libsqstatesb.so
+
 # neet -lrt for clock_gettime
 LIBSX	       += -lrt
 #
@@ -134,6 +135,7 @@
 		  $(OUTDIR)/mslabelmaps.o \
 		  $(OUTDIR)/mstracevars.o \
 		  $(OUTDIR)/msvars.o \
+		  $(OUTDIR)/slotmgr.o \
 		  $(OUTDIR)/otrace.o \
 		  $(OUTDIR)/thread.o \
 		  $(OUTDIR)/threadl.o \
@@ -176,7 +178,6 @@
 $(LIBEXPDIR)/libsbms.so: $(OBJSBMS) $(LIBSBUTIL)
 	$(CXX) $(LIBSX) $(LNK_FLGS) -shared -o $@ $(OBJSBMS) $(LIBSBUTILX) $(LIBSMX) -ldl
 
-
 $(LIBEXPDIR)/libsbsautil.so: $(OBJSBSAUTIL)
 	$(OCXX) $(LNK_FLGS) -shared -o $@ $(OBJSBSAUTIL) -lpthread
 
diff --git a/core/sqf/src/seabed/src/apictr.h b/core/sqf/src/seabed/src/apictr.h
index e0f477b..707240f 100644
--- a/core/sqf/src/seabed/src/apictr.h
+++ b/core/sqf/src/seabed/src/apictr.h
@@ -155,6 +155,7 @@
         SB_ACTR_MSG_MON_GET_TRANS_INFO_TRANSID,
         SB_ACTR_MSG_MON_GET_ZONE_INFO,
         SB_ACTR_MSG_MON_GET_ZONE_INFO_DETAIL,
+        SB_ACTR_MSG_MON_GET_INSTANCE_ID,
         SB_ACTR_MSG_MON_MOUNT_DEVICE,
         SB_ACTR_MSG_MON_NODE_DOWN,
         SB_ACTR_MSG_MON_NODE_DOWN2,
diff --git a/core/sqf/src/seabed/src/apictr.sh b/core/sqf/src/seabed/src/apictr.sh
index 0602d9d..8fd5410 100644
--- a/core/sqf/src/seabed/src/apictr.sh
+++ b/core/sqf/src/seabed/src/apictr.sh
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 #
 # @@@ START COPYRIGHT @@@
 #
diff --git a/core/sqf/src/seabed/src/env.cpp b/core/sqf/src/seabed/src/env.cpp
index a7ed909..d002ef7 100644
--- a/core/sqf/src/seabed/src/env.cpp
+++ b/core/sqf/src/seabed/src/env.cpp
@@ -60,7 +60,7 @@
 const char *gp_ms_env_max_cap_mds        = "MS_MAX_CAP_MDS";
 const char *gp_ms_env_max_cap_ods        = "MS_MAX_CAP_ODS";
 const char *gp_ms_env_max_cap_phandles   = "MS_MAX_CAP_PHANDLES";
-const char *gp_ms_env_mpi_tmpdir         = "MPI_TMPDIR";
+const char *gp_ms_env_mpi_tmpdir         = "TRAF_LOG";
 const char *gp_ms_env_msg_timestamp      = "MS_MSG_TIMESTAMP";
 const char *gp_ms_env_sb_api_sig         = "SB_API_SIG";
 const char *gp_ms_env_shutdown_fast      = "MS_SHUTDOWN_FAST";
diff --git a/core/sqf/src/seabed/src/fs.cpp b/core/sqf/src/seabed/src/fs.cpp
index 88e1290..c7ab385 100644
--- a/core/sqf/src/seabed/src/fs.cpp
+++ b/core/sqf/src/seabed/src/fs.cpp
@@ -415,9 +415,9 @@
 //
 // Purpose: handle process startup
 //
-SB_Export int file_mon_process_startup2(int pv_sysmsgs, int pv_pipeio)
+SB_Export int file_mon_process_startup2(int pv_sysmsgs, int pv_pipeio, bool pv_remap_std_err)
 SB_THROWS_FATAL {
-    return msg_mon_process_startup3(pv_sysmsgs, pv_pipeio);
+  return msg_mon_process_startup3(pv_sysmsgs, pv_pipeio, pv_remap_std_err);
 }
 
 //
diff --git a/core/sqf/src/seabed/src/labelmaps.cpp b/core/sqf/src/seabed/src/labelmaps.cpp
index 4e1dfdd..ef7ff22 100644
--- a/core/sqf/src/seabed/src/labelmaps.cpp
+++ b/core/sqf/src/seabed/src/labelmaps.cpp
@@ -227,7 +227,6 @@
     "MS_MsgType_NodeDeleted",
     "MS_MsgType_NodeDown",
     "MS_MsgType_NodeJoining",
-    "MS_MsgType_NodePrepare",
     "MS_MsgType_NodeQuiesce",
     "MS_MsgType_NodeUp",
     "MS_MsgType_Open",
@@ -237,10 +236,6 @@
     "MS_MsgType_Service",
     "MS_MsgType_Shutdown",
     "MS_MsgType_SpareUp",
-    "MS_MsgType_TmRestarted",
-    "MS_MsgType_TmSyncAbort",
-    "MS_MsgType_TmSyncCommit",
-    "MS_MsgType_UnsolicitedMessage",
     SB_LABEL_END
 };
 
@@ -253,7 +248,6 @@
     "NodeDeleted",
     "NodeDown",
     "NodeJoining",
-    "NodePrepare",
     "NodeQuiesce",
     "NodeUp",
     "Open",
@@ -263,10 +257,6 @@
     "Service",
     "Shutdown",
     "SpareUp",
-    "TmRestarted",
-    "TmSyncAbort",
-    "TmSyncCommit",
-    "UnsolicitedMessage",
     SB_LABEL_END
 };
 
@@ -349,6 +339,7 @@
     "MS_ReqType_Event",
     "MS_ReqType_Exit",
     "MS_ReqType_Get",
+    "MS_ReqType_InstanceId",
     "MS_ReqType_Kill",
     "MS_ReqType_MonStats",
     "MS_ReqType_Mount",
@@ -378,11 +369,8 @@
     "MS_ReqType_Shutdown",
     "MS_ReqType_ShutdownNs",
     "MS_ReqType_Startup",
-    "MS_ReqType_Stfsd",
     "MS_ReqType_TmLeader",
     "MS_ReqType_TmReady",
-    "MS_ReqType_TmSync",
-    "MS_ReqType_TransInfo",
     "MS_ReqType_ZoneInfo",
     SB_LABEL_END
 };
@@ -394,6 +382,7 @@
     "Event",
     "Exit",
     "Get",
+    "InstanceId",
     "Kill",
     "MonStats",
     "Mount",
@@ -423,11 +412,8 @@
     "Shutdown",
     "ShutdownNs",
     "Startup",
-    "Stfsd",
     "TmLeader",
     "TmReady",
-    "TmSync",
-    "TransInfo",
     "ZoneInfo",
     SB_LABEL_END
 };
@@ -478,6 +464,7 @@
     "DeleteNs",
     "Dump",
     "Get",
+    "InstanceId",
     "MonStats",
     "Mount",
     "NewProcess",
@@ -489,10 +476,7 @@
     "PNodeInfo",
     "ProcessInfo",
     "ProcessInfoNs",
-    "Stfsd",
     "Startup",
-    "TmSync",
-    "TransInfo",
     "ZoneInfo",
     SB_LABEL_END
 };
@@ -564,7 +548,6 @@
     "MsgType_NodeDeleted",
     "MsgType_NodeDown",
     "MsgType_NodeJoining",
-    "MsgType_NodePrepare",
     "MsgType_NodeQuiesce",
     "MsgType_NodeUp",
     "MsgType_Open",
@@ -574,10 +557,6 @@
     "MsgType_Service",
     "MsgType_Shutdown",
     "MsgType_SpareUp",
-    "MsgType_TmRestarted",
-    "MsgType_TmSyncAbort",
-    "MsgType_TmSyncCommit",
-    "MsgType_UnsolicitedMessage",
     "MsgType_Invalid",
     SB_LABEL_END
 };
@@ -589,6 +568,7 @@
     "ReqType_Event",
     "ReqType_Exit",
     "ReqType_Get",
+    "ReqType_InstanceId",
     "ReqType_Kill",
     "ReqType_MonStats",
     "ReqType_Mount",
@@ -618,11 +598,8 @@
     "ReqType_Shutdown",
     "ReqType_ShutdownNs",
     "ReqType_Startup",
-    "ReqType_Stfsd",
     "ReqType_TmLeader",
     "ReqType_TmReady",
-    "ReqType_TmSync",
-    "ReqType_TransInfo",
     "ReqType_ZoneInfo",
     SB_LABEL_END
 };
@@ -715,6 +692,7 @@
     "MSG_MON_GET_TRANS_INFO_TRANSID",
     "MSG_MON_GET_ZONE_INFO",
     "MSG_MON_GET_ZONE_INFO_DETAIL",
+    "MSG_MON_GET_INSTANCE_ID",
     "MSG_MON_MOUNT_DEVICE,",
     "MSG_MON_MOUNT_DEVICE2",
     "MSG_MON_NODE_DOWN",
@@ -855,7 +833,7 @@
 
 enum {
     MS_LABEL_LIMIT_MON_MSG_TYPE_LO = MS_MsgType_Change,
-    MS_LABEL_LIMIT_MON_MSG_TYPE_HI = MS_MsgType_UnsolicitedMessage
+    MS_LABEL_LIMIT_MON_MSG_TYPE_HI = MS_MsgType_SpareUp // same MS_MsgType_* enum family as _LO
 };
 SB_Label_Map gv_ms_mon_msg_type_label_map = {
     MS_LABEL_LIMIT_MON_MSG_TYPE_LO,
diff --git a/core/sqf/src/seabed/src/mpitmsg.cpp b/core/sqf/src/seabed/src/mpitmsg.cpp
index 03f5e0c..044647a 100644
--- a/core/sqf/src/seabed/src/mpitmsg.cpp
+++ b/core/sqf/src/seabed/src/mpitmsg.cpp
@@ -43,19 +43,12 @@
 //
 // statics
 //
-SB_Trans::Md_Table_Entry_Mgr SB_Trans::Msg_Mgr::cv_md_table_entry_mgr;
-SB_Trans::Md_Table_Mgr       SB_Trans::Msg_Mgr::cv_md_table("tablemgr-MD",
-                                                            SB_Table_Mgr_Alloc::ALLOC_FIFO,
-                                                            SB_Table_Mgr_Alloc::ALLOC_ENTRY_BLOCK,
-                                                            &cv_md_table_entry_mgr,
-                                                            4096, 1024); // cap-init, cap-inc
 SB_Atomic_Int                SB_Trans::Msg_Mgr::cv_md_table_count_recv;
 SB_Atomic_Int                SB_Trans::Msg_Mgr::cv_md_table_count_send;
 SB_Atomic_Int                SB_Trans::Msg_Mgr::cv_md_table_count_total;
 int                          SB_Trans::Msg_Mgr::cv_md_table_hi_recv = 0;
 int                          SB_Trans::Msg_Mgr::cv_md_table_hi_send = 0;
 int                          SB_Trans::Msg_Mgr::cv_md_table_hi_total = 0;
-int                          SB_Trans::Msg_Mgr::cv_md_table_inx = SB_Trans::Msg_Mgr::init();
 int                          SB_Trans::Msg_Mgr::cv_md_table_max_recv = 255;
 int                          SB_Trans::Msg_Mgr::cv_md_table_max_send = 1023;
 
diff --git a/core/sqf/src/seabed/src/ms.cpp b/core/sqf/src/seabed/src/ms.cpp
index 43ce18c..9c344ab 100644
--- a/core/sqf/src/seabed/src/ms.cpp
+++ b/core/sqf/src/seabed/src/ms.cpp
@@ -182,6 +182,14 @@
 void __attribute__((constructor)) __msg_init(void);
 void __attribute__((destructor))  __msg_fini(void);
 
+SB_Trans::Md_Table_Entry_Mgr SB_Trans::Msg_Mgr::cv_md_table_entry_mgr;
+SB_Trans::Md_Table_Mgr       SB_Trans::Msg_Mgr::cv_md_table("tablemgr-MD",
+                                                            SB_Table_Mgr_Alloc::ALLOC_FIFO,
+                                                            SB_Table_Mgr_Alloc::ALLOC_ENTRY_BLOCK,
+                                                            &cv_md_table_entry_mgr,
+                                                            4096, 1024); // cap-init, cap-inc
+int                          SB_Trans::Msg_Mgr::cv_md_table_inx = SB_Trans::Msg_Mgr::init();
+
 //
 // forwards
 //
@@ -228,7 +236,7 @@
 //
 // Purpose:
 //
-void ms_fifo_setup(int pv_orig_fd, char *pp_fifo_name) {
+void ms_fifo_setup(int pv_orig_fd, char *pp_fifo_name, bool pv_remap_fd) {
     const char *WHERE = "ms_fifo_setup";
     int         lv_err;
     int         lv_fifo_fd;
@@ -244,26 +252,28 @@
             trace_where_printf("fifo open error, fifo=%s, errno=%d\n",
                                pp_fifo_name, errno);
     } else {
-        // Remap fifo file descriptor
-        // Close unneeded fifo file descriptor.
-        lv_err = close(pv_orig_fd);
-        if (lv_err == -1) {
-            if (gv_ms_trace)
-                trace_where_printf(WHERE, "fifo original close error, fd=%d, errno=%d\n",
-                                   pv_orig_fd, errno);
-        }
-
-        lv_err = dup2(lv_fifo_fd, pv_orig_fd);
-        if (lv_err == -1) {
-            if (gv_ms_trace)
-                trace_where_printf(WHERE, "fifo dup2 error, old-fd=%d, new-fd=%d, errno=%d\n",
-                                   lv_fifo_fd, pv_orig_fd, errno);
-        } else {
-            lv_err = close(lv_fifo_fd);
+        if (pv_remap_fd) {
+            // Remap fifo file descriptor
+            // Close unneeded fifo file descriptor.
+            lv_err = close(pv_orig_fd);
             if (lv_err == -1) {
                 if (gv_ms_trace)
-                    trace_where_printf(WHERE, "fifo close error, fifo-fd=%d, errno=%d\n",
-                                       lv_fifo_fd, errno);
+                    trace_where_printf(WHERE, "fifo original close error, fd=%d, errno=%d\n",
+                                       pv_orig_fd, errno);
+            }
+
+            lv_err = dup2(lv_fifo_fd, pv_orig_fd);
+            if (lv_err == -1) {
+                if (gv_ms_trace)
+                    trace_where_printf(WHERE, "fifo dup2 error, old-fd=%d, new-fd=%d, errno=%d\n",
+                                       lv_fifo_fd, pv_orig_fd, errno);
+            } else {
+                lv_err = close(lv_fifo_fd);
+                if (lv_err == -1) {
+                    if (gv_ms_trace)
+                        trace_where_printf(WHERE, "fifo close error, fifo-fd=%d, errno=%d\n",
+                                           lv_fifo_fd, errno);
+                }
             }
         }
     }
@@ -806,7 +816,7 @@
 
     lv_len = static_cast<int>(strlen(pp_arg));
 
-    if ((lv_len != 5) && (lv_len != 6))
+    if ((lv_len < 5) || (lv_len > 8))
         return false;
     for (lv_inx = 0; lv_inx < lv_len; lv_inx++)
         if (!isdigit(pp_arg[lv_inx]))
@@ -1340,8 +1350,6 @@
         ms_gather_info(WHERE);
     }
 
-    if (ms_getenv_str(gp_ms_env_assert_chk) == NULL)
-        gv_ms_assert_chk = true;
     if (gv_ms_trace_enable)
         trace_where_printf(WHERE, "TCP set, MS_ASSERT_CHK=%d\n",
                            gv_ms_assert_chk);
diff --git a/core/sqf/src/seabed/src/msmon.cpp b/core/sqf/src/seabed/src/msmon.cpp
index 17628f6..f7be644 100644
--- a/core/sqf/src/seabed/src/msmon.cpp
+++ b/core/sqf/src/seabed/src/msmon.cpp
@@ -242,7 +242,6 @@
 static int                    gv_ms_trace_callback_inx = 0;
 static MS_Mon_Tmlib_Cb_Type   gv_ms_tmlib_callback      = NULL;
 static MS_Mon_Tmlib2_Cb_Type  gv_ms_tmlib2_callback     = NULL;
-static MS_Mon_TmSync_Cb_Type  gv_ms_tmsync_callback     = NULL;
 SB_Ts_Lmap                    Ms_Open_Thread::cv_run_map("map-ms-open-thread-run");
 
 typedef struct Map_Tag_Entry_Type {
@@ -338,9 +337,12 @@
                                                   bool pv_attach,
                                                   bool pv_eventmsgs,
                                                   bool pv_pipeio,
-                                                  bool pv_altsig)
+                                                  bool pv_altsig,
+                                                  bool pv_stderr_remap=true)
 SB_THROWS_FATAL;
-static int            msg_mon_process_startup_ph1(bool pv_attach, bool pv_altsig)
+static int            msg_mon_process_startup_ph1(bool pv_attach, 
+                                                  bool pv_altsig, 
+                                                  bool pv_stderr_remap=true)
 SB_THROWS_FATAL;
 static void           msg_mon_recv_msg_cbt(MS_Md_Type *pp_md);
 static void           msg_mon_recv_msg_cbt_discard(const char *pp_where,
@@ -364,13 +366,9 @@
 static void           msg_mon_recv_msg_process_created(Mon_Msg_Type *pp_msg);
 static void           msg_mon_recv_msg_process_death(Mon_Msg_Type *pp_msg);
 static void           msg_mon_recv_msg_shutdown(Mon_Msg_Type *pp_msg);
-static void           msg_mon_recv_msg_tmsync_abort(Mon_Msg_Type *pp_msg);
-static void           msg_mon_recv_msg_tmsync_commit(Mon_Msg_Type *pp_msg);
 static void           msg_mon_recv_msg_unknown(Mon_Msg_Type *pp_msg);
 static void           msg_mon_recv_notice_msg_loc_cbt(Mon_Msg_Type *pp_msg,
                                                       int           pv_size);
-static void           msg_mon_recv_unsol_msg_loc_cbt(Mon_Msg_Type *pp_msg,
-                                                     int           pv_size);
 static int            msg_mon_send_node_info(const char   *pp_where,
                                              Mon_Msg_Type *pp_msg,
                                              int           pv_msg_err,
@@ -455,8 +453,6 @@
                                                   Mon_Msg_Type *pp_msg);
 static void           msg_mon_trace_msg_node_quiesce(const char   *pp_where,
                                                      Mon_Msg_Type *pp_msg);
-static void           msg_mon_trace_msg_node_prepare(const char   *pp_where,
-                                                     Mon_Msg_Type *pp_msg);
 static void           msg_mon_trace_msg_node_up(const char   *pp_where,
                                                 Mon_Msg_Type *pp_msg);
 static void           msg_mon_trace_msg_open(const char   *pp_where,
@@ -467,8 +463,6 @@
                                                       Mon_Msg_Type *pp_msg);
 static void           msg_mon_trace_msg_shutdown(const char   *pp_where,
                                                  Mon_Msg_Type *pp_msg);
-static void           msg_mon_trace_msg_tmsync(const char   *pp_where,
-                                               Mon_Msg_Type *pp_msg);
 static void           msg_mon_trace_msg_unknown(const char   *pp_where,
                                                 Mon_Msg_Type *pp_msg);
 static void           ms_fs_shutdown_ph1();
@@ -1564,7 +1558,7 @@
     if (pp_path == NULL) {
         lp_path = getenv(gp_ms_env_sq_snapshot_dir);
         if (lp_path == NULL)
-            lp_path = getenv("PWD");
+            lp_path = getenv("TRAF_LOG");
     } else
         lp_path = pp_path;
     // make absolute path and check len - don't need ms_util_string_copy
@@ -1574,7 +1568,7 @@
             return ms_err_rtn_msg(pp_where, "EXIT", XZFIL_ERR_BOUNDSERR);
         strcpy(lp_msg->u.request.u.dump.path, lp_path);
     } else {
-        lp_cwd = getenv("PWD");
+        lp_cwd = getenv("TRAF_LOG");
         lv_len = strlen(lp_cwd) + 1 + strlen(lp_path);
         if (lv_len > sizeof(lp_msg->u.request.u.dump.path))
             return ms_err_rtn_msg(pp_where, "EXIT", XZFIL_ERR_BOUNDSERR);
@@ -2055,6 +2049,62 @@
 }
 
 //
+// Purpose: get cluster id and instance id
+//
+SB_Export int msg_mon_get_instance_id(int *pp_cluster_id,    // out: written only when the reply passes msg_mon_msg_ok
+                                      int *pp_instance_id) { // out: written only when the reply passes msg_mon_msg_ok
+    const char   *WHERE = "msg_mon_get_instance_id";
+    Mon_Msg_Type *lp_msg;
+    int           lv_mpierr;
+    SB_API_CTR   (lv_zctr, MSG_MON_GET_INSTANCE_ID);
+
+    SB_UTRACE_API_ADD2(SB_UTRACE_API_OP_MSG_MON_GET_INSTANCE_ID, 0);
+
+    if (gv_ms_trace_mon)
+        trace_where_printf(WHERE, "ENTER nid=%d, pid=%d\n",
+                           gv_ms_su_nid, gv_ms_su_pid);
+    if (!gv_ms_mon_calls_ok) // msg_mon_get_instance_id
+        return ms_err_rtn_msg(WHERE, "msg_init() or startup not called or shutdown",
+                              XZFIL_ERR_INVALIDSTATE);
+
+    Mon_Msg_Auto lv_msg;
+    lp_msg = &lv_msg;
+    lp_msg->type = MsgType_Service;
+    lp_msg->noreply = false; // a reply is expected; it is examined after the send/recv below
+    lp_msg->u.request.type = ReqType_InstanceId;
+    lp_msg->u.request.u.instance_id.nid = gv_ms_su_nid;
+    lp_msg->u.request.u.instance_id.pid = gv_ms_su_pid;
+    lp_msg->u.request.u.instance_id.verifier = gv_ms_su_verif;
+    if (gv_ms_trace_mon)
+        trace_where_printf(WHERE, "send instance-id req to mon, nid=%d, pid=%d\n",
+                           gv_ms_su_nid, gv_ms_su_pid);
+    lv_mpierr = msg_mon_sendrecv_mon(WHERE,
+                                     "instance-id",
+                                     lp_msg,
+                                     lv_msg.get_error());
+    if (msg_mon_msg_ok(WHERE,
+                       "instance-id req",
+                       &lv_mpierr, // passed by address, so the check may update it
+                       lp_msg,
+                       MsgType_Service,
+                       ReplyType_InstanceId)) {
+        struct InstanceId_reply_def *lp_instance = &lp_msg->u.reply.u.instance_id;
+        // copy results for user (NOTE(review): output pointers are not NULL-checked here)
+        *pp_cluster_id = lp_instance->cluster_id;
+        *pp_instance_id = lp_instance->instance_id;
+        if (gv_ms_trace_mon) {
+            if (lv_mpierr == MPI_SUCCESS) {
+                trace_where_printf(WHERE, "EXIT OK instance-id req, cluster_id=%d, instance_id=%d\n",
+                                   lp_instance->cluster_id, lp_instance->instance_id);
+            } else
+                trace_where_printf(WHERE, "EXIT FAILED instance-id req, ret=%d\n",
+                                   lv_mpierr);
+        }
+    }
+    return ms_err_mpi_rtn_msg(WHERE, "EXIT", lv_mpierr);
+}
+
+//
 // Purpose: get monitor stats
 //
 SB_Export int msg_mon_get_monitor_stats(MS_Mon_Monitor_Stats_Type *pp_stats) {
@@ -3273,129 +3323,6 @@
     return lv_ref_count;
 }
 
-//
-// Purpose: get trans info
-//
-static int msg_mon_get_trans_info_com(const char             *pp_where,
-                                      char                   *pp_name,
-                                      MS_Mon_Transid_Type     pv_transid,
-                                      MS_Mon_Trans_Info_Type *pp_info) {
-    Mon_Msg_Type *lp_msg;
-    int           lv_mpierr;
-
-    if (pp_info == NULL)
-        return ms_err_rtn_msg(pp_where,
-                              "invalid info (null)",
-                              XZFIL_ERR_BOUNDSERR);
-    if (!gv_ms_mon_calls_ok) // msg_mon_get_trans_info
-        return ms_err_rtn_msg(pp_where, "msg_init() or startup not called or shutdown",
-                              XZFIL_ERR_INVALIDSTATE);
-
-    Mon_Msg_Auto lv_msg;
-    lp_msg = &lv_msg;
-    lp_msg->type = MsgType_Service;
-    lp_msg->noreply = false;
-    lp_msg->u.request.type = ReqType_TransInfo;
-    lp_msg->u.request.u.trans_info.nid = gv_ms_su_nid;
-    lp_msg->u.request.u.trans_info.pid = gv_ms_su_pid;
-    if (pp_name == NULL)
-        lp_msg->u.request.u.trans_info.process_name[0] = '\0';
-    else
-        ms_util_string_copy(lp_msg->u.request.u.trans_info.process_name,
-                            sizeof(lp_msg->u.request.u.trans_info.process_name),
-                            pp_name);
-    TRANSID_COPY_MON_TO(lp_msg->u.request.u.trans_info.trans_id, pv_transid);
-
-    if (gv_ms_trace_mon)
-        trace_where_printf(pp_where, "send trans-info req to mon\n");
-    lv_mpierr = msg_mon_sendrecv_mon(pp_where,
-                                     "trans-info",
-                                     lp_msg,
-                                     lv_msg.get_error());
-    if (msg_mon_msg_ok(pp_where,
-                       "trans-info req",
-                       &lv_mpierr,
-                       lp_msg,
-                       MsgType_Service,
-                       ReplyType_TransInfo)) {
-        lv_mpierr = lp_msg->u.reply.u.trans_info.return_code;
-        if ((lv_mpierr == MPI_SUCCESS) || (lv_mpierr == MPI_ERR_TRUNCATE)) {
-            if (gv_ms_trace_mon) {
-                int lv_num_procs = lp_msg->u.reply.u.trans_info.num_processes;
-                trace_where_printf(pp_where, "trans-info req OK, num_procs=%d, ret=%d\n",
-                                   lv_num_procs, lv_mpierr);
-                for (int lv_proc = 0; lv_proc < lv_num_procs; lv_proc++) {
-                    char la_transid[100];
-                    MS_Mon_Transid_Type lv_transid_copy;
-                    TRANSID_COPY_MON_FROM(lv_transid_copy, lp_msg->u.reply.u.trans_info.procs[lv_proc].trans_id);
-                    msg_util_format_transid(la_transid, lv_transid_copy);
-                    trace_where_printf(pp_where, "proc[%d].p-id=%d/%d, transid=%s\n",
-                                       lv_proc,
-                                       lp_msg->u.reply.u.trans_info.procs[lv_proc].nid,
-                                       lp_msg->u.reply.u.trans_info.procs[lv_proc].pid,
-                                       la_transid);
-                }
-            }
-
-            // copy results for user
-            memcpy(pp_info,
-                   &lp_msg->u.reply.u.trans_info,
-                   sizeof(MS_Mon_Trans_Info_Type));
-            pp_info->truncated = (lv_mpierr == MPI_SUCCESS) ? 0 : -1;
-            lv_mpierr = MPI_SUCCESS;
-        } else {
-            if (gv_ms_trace_mon)
-                trace_where_printf(pp_where, "EXIT FAILURE trans-info req, ret=%d\n",
-                                   lv_mpierr);
-        }
-    }
-    return ms_err_mpi_rtn_msg(pp_where, "EXIT", lv_mpierr);
-}
-
-//
-// Purpose: get trans info for process
-//
-SB_Export int msg_mon_get_trans_info_process(char                   *pp_name,
-                                             MS_Mon_Trans_Info_Type *pp_info) {
-    const char *WHERE = "msg_mon_get_trans_info_process";
-    SB_API_CTR (lv_zctr, MSG_MON_GET_TRANS_INFO_PROCESS);
-
-    SB_UTRACE_API_ADD2(SB_UTRACE_API_OP_MSG_MON_GET_TRANS_INFO_PROCESS, 0);
-    if (gv_ms_trace_mon)
-        trace_where_printf(WHERE, "ENTER pname=%s, info=%p\n",
-                           pp_name, pfp(pp_info));
-    if (pp_name == NULL)
-        return ms_err_rtn_msg(WHERE,
-                              "invalid process name (null)",
-                              XZFIL_ERR_BOUNDSERR);
-    MS_Mon_Transid_Type lv_transid;
-    TRANSID_SET_INVALID(lv_transid);
-    return msg_mon_get_trans_info_com(WHERE, pp_name, lv_transid, pp_info);
-}
-
-//
-// Purpose: get trans info for transid
-//
-SB_Export int msg_mon_get_trans_info_transid(MS_Mon_Transid_Type     pv_transid,
-                                             MS_Mon_Trans_Info_Type *pp_info) {
-    const char *WHERE = "msg_mon_get_trans_info_transid";
-    SB_API_CTR (lv_zctr, MSG_MON_GET_TRANS_INFO_TRANSID);
-
-    SB_UTRACE_API_ADD2(SB_UTRACE_API_OP_MSG_MON_GET_TRANS_INFO_TRANSID, 0);
-    if (gv_ms_trace_mon) {
-        char la_transid[100];
-        msg_util_format_transid(la_transid, pv_transid);
-        trace_where_printf(WHERE, "ENTER transid=%s, info=%p\n",
-                           la_transid, pfp(pp_info));
-    }
-    if (TRANSID_IS_INVALID(pv_transid))
-        return ms_err_rtn_msg(WHERE,
-                              "invalid transid (-1)",
-                              XZFIL_ERR_BOUNDSERR);
-
-    return msg_mon_get_trans_info_com(WHERE, NULL, pv_transid, pp_info);
-}
-
 
 //
 // Purpose: get zone info (all nodes)
@@ -3629,8 +3556,6 @@
 
     SB_util_static_assert(static_cast<int>(MS_MsgType_Change) ==
                           static_cast<int>(MsgType_Change)); // sw fault
-    SB_util_static_assert(static_cast<int>(MS_MsgType_UnsolicitedMessage) ==
-                          static_cast<int>(MsgType_UnsolicitedMessage)); // sw fault
 
     SB_util_static_assert(static_cast<int>(MS_ReqType_Close) ==
                           static_cast<int>(ReqType_Close)); // sw fault
@@ -3659,8 +3584,6 @@
                           sizeof(NodeDown_def)); // sw fault
     SB_util_static_assert(sizeof(MS_Mon_NodeJoining_def) ==
                           sizeof(NodeJoining_def)); // sw fault
-    SB_util_static_assert(sizeof(MS_Mon_NodePrepare_def) ==
-                          sizeof(NodePrepare_def)); // sw fault
     SB_util_static_assert(sizeof(MS_Mon_NodeQuiesce_def) ==
                           sizeof(NodeQuiesce_def)); // sw fault
     SB_util_static_assert(sizeof(MS_Mon_NodeUp_def) ==
@@ -3673,8 +3596,6 @@
                           sizeof(Shutdown_def)); // sw fault
     SB_util_static_assert(sizeof(MS_Mon_SpareUp_def) ==
                           sizeof(SpareUp_def)); // sw fault
-    SB_util_static_assert(sizeof(MS_Mon_TmSyncNotice_def) ==
-                          sizeof(TmSyncNotice_def)); // sw fault
     // More structs
     SB_util_static_assert(sizeof(MS_Mon_Monitor_Stats_Type) ==
                           sizeof(lp_msg->u.reply.u.mon_info)); // sw fault
@@ -3690,8 +3611,6 @@
                           sizeof(lp_msg->u.reply.u.process_info.process)/MAX_PROCINFO_LIST); // sw fault
     SB_util_static_assert(sizeof(MS_Mon_Reg_Get_Type) ==
                           sizeof(lp_msg->u.reply.u.get)); // sw fault
-    SB_util_static_assert(sizeof(MS_Mon_Trans_Info_Type) ==
-                          sizeof(lp_msg->u.reply.u.trans_info)); // sw fault
     SB_util_static_assert(sizeof(MS_Mon_Zone_Info_Type) ==
                           sizeof(lp_msg->u.reply.u.zone_info)); // sw fault
     // Check struct offsets
@@ -5495,7 +5414,7 @@
         SB_Trans::Trans_Stream::close_nidpid_streams(false);
 
         if (gv_ms_trans_sock)
-            SB_Trans::Sock_Stream::close_streams();
+            SB_Trans::Sock_Stream::close_streams(false);
     }
 
     // shutdown ms
@@ -5656,7 +5575,7 @@
 //
 // Purpose: handle process startup
 //
-SB_Export int msg_mon_process_startup3(int pv_sysmsgs, int pv_pipeio)
+SB_Export int msg_mon_process_startup3(int pv_sysmsgs, int pv_pipeio, bool pv_remap_stderr)
 SB_THROWS_FATAL {
     SB_API_CTR (lv_zctr, MSG_MON_PROCESS_STARTUP3);
 
@@ -5665,7 +5584,8 @@
                                        gv_ms_attach,
                                        true,       // eventmsgs
                                        pv_pipeio,
-                                       false);     // altsig
+                                       false,      // altsig
+                                       pv_remap_stderr);
 }
 
 //
@@ -5690,7 +5610,8 @@
                                 bool pv_attach,
                                 bool pv_eventmsgs,
                                 bool pv_pipeio,
-                                bool pv_altsig)
+                                bool pv_altsig,
+                                bool pv_remap_stderr)
 SB_THROWS_FATAL {
     const char   *WHERE = "msg_mon_process_startup";
     char         *lp_s;
@@ -5721,7 +5642,7 @@
     gv_ms_su_pipeio = pv_pipeio;
     gv_ms_su_altsig = pv_altsig;
 
-    int lv_fserr = msg_mon_process_startup_ph1(pv_attach, pv_altsig);
+    int lv_fserr = msg_mon_process_startup_ph1(pv_attach, pv_altsig, pv_remap_stderr);
     if (gp_local_mon_io != NULL)
         gv_ms_mon_calls_ok = true;
     if (gv_ms_trace_mon)
@@ -5732,7 +5653,7 @@
 //
 // Purpose: handle process startup
 //
-int msg_mon_process_startup_ph1(bool pv_attach, bool pv_altsig)
+int msg_mon_process_startup_ph1(bool pv_attach, bool pv_altsig, bool pv_remap_stderr)
 SB_THROWS_FATAL {
     const char           *WHERE = "msg_mon_process_startup_ph1";
     Mon_Shared_Msg_Type  *lp_msg;
@@ -5827,8 +5748,6 @@
     SB_util_assert_pne(gp_local_mon_io, NULL);
     lv_ret = gp_local_mon_io->set_cb(msg_mon_recv_msg_loc_cbt, "recv");
     SB_util_assert_if(lv_ret);
-    lv_ret = gp_local_mon_io->set_cb(msg_mon_recv_unsol_msg_loc_cbt, "unsol");
-    SB_util_assert_if(lv_ret);
     lv_ret = gp_local_mon_io->set_cb(msg_mon_recv_notice_msg_loc_cbt, "notice");
     SB_util_assert_if(lv_ret);
 
@@ -5959,7 +5878,7 @@
                     // Connect to monitor via pipes and remap stdout and stderr
                     if (gv_ms_su_pipeio)
                         ms_fifo_setup(1, lp_msg->msg.u.reply.u.startup_info.fifo_stdout);
-                    ms_fifo_setup(2, lp_msg->msg.u.reply.u.startup_info.fifo_stderr);
+                    ms_fifo_setup(2, lp_msg->msg.u.reply.u.startup_info.fifo_stderr, pv_remap_stderr);
                 }
                 if (gv_ms_trace_name) {
                     sprintf(ga_ms_su_trace_pname, "%s:%d/%d",
@@ -6038,12 +5957,6 @@
     case MsgType_Shutdown:
         msg_mon_recv_msg_shutdown(lp_msg);
         break;
-    case MsgType_TmSyncAbort:
-        msg_mon_recv_msg_tmsync_abort(lp_msg);
-        break;
-    case MsgType_TmSyncCommit:
-        msg_mon_recv_msg_tmsync_commit(lp_msg);
-        break;
     default:
         msg_mon_recv_msg_unknown(lp_msg);
         break;
@@ -6557,14 +6470,6 @@
     pp_msg = pp_msg; // touch
 }
 
-void msg_mon_recv_msg_tmsync_abort(Mon_Msg_Type *pp_msg) {
-    pp_msg = pp_msg; // touch
-}
-
-void msg_mon_recv_msg_tmsync_commit(Mon_Msg_Type *pp_msg) {
-    pp_msg = pp_msg; // touch
-}
-
 void msg_mon_recv_msg_unknown(Mon_Msg_Type *pp_msg) {
     pp_msg = pp_msg; // touch
 }
@@ -6580,51 +6485,6 @@
         msg_mon_recv_msg_cbt_discard(WHERE, lp_md, false);
 }
 
-void msg_mon_recv_unsol_msg_loc_cbt(Mon_Msg_Type *pp_msg, int) {
-    const char   *WHERE = "msg_mon_recv_unsol_msg_loc_cbt";
-    Mon_Msg_Type *lp_msg;
-    int           lv_cbret;
-    int           lv_err;
-    int           lv_handle;
-
-    if (gv_ms_tmsync_callback != NULL) {
-        SB_util_assert_ieq(pp_msg->type, MsgType_UnsolicitedMessage); // sw fault
-        SB_util_assert_ieq(pp_msg->u.request.type, ReqType_TmSync); // sw fault
-
-        lv_handle = pp_msg->u.request.u.unsolicited_tm_sync.handle;
-        if (gv_ms_trace_mon) {
-            trace_where_printf(WHERE, "received tmsync req from mon, p-id=%d/%d, handle=%d len=%d\n",
-                               pp_msg->u.request.u.unsolicited_tm_sync.nid,
-                               pp_msg->u.request.u.unsolicited_tm_sync.pid,
-                               lv_handle,
-                               pp_msg->u.request.u.unsolicited_tm_sync.length);
-            trace_print_data(pp_msg->u.request.u.unsolicited_tm_sync.data,
-                             pp_msg->u.request.u.unsolicited_tm_sync.length,
-                             pp_msg->u.request.u.unsolicited_tm_sync.length);
-        }
-        lv_cbret = gv_ms_tmsync_callback(pp_msg->u.request.u.unsolicited_tm_sync.data,
-                                         pp_msg->u.request.u.unsolicited_tm_sync.length,
-                                         lv_handle);
-    } else {
-        if (gv_ms_trace_mon)
-            trace_where_printf(WHERE, "no tmsync callback, replying with error\n");
-        lv_handle = pp_msg->u.request.u.unsolicited_tm_sync.handle;
-        lv_cbret = 1; // set error
-    }
-    lv_err = gp_local_mon_io->acquire_msg(&lp_msg);
-    SB_util_assert_ieq(lv_err, 0); // TODO: revisit
-    lp_msg->type = MsgType_UnsolicitedMessage;
-    lp_msg->noreply = true;
-    lp_msg->u.reply.type = ReplyType_TmSync;
-    lp_msg->u.reply.u.unsolicited_tm_sync.nid = gv_ms_su_nid;
-    lp_msg->u.reply.u.unsolicited_tm_sync.pid = gv_ms_su_pid;
-    lp_msg->u.reply.u.unsolicited_tm_sync.handle = lv_handle;
-    lp_msg->u.reply.u.unsolicited_tm_sync.return_code = lv_cbret;
-    lp_msg->reply_tag = 0;
-    lv_err = gp_local_mon_io->send(lp_msg);
-    SB_util_assert_ieq(lv_err, 0); // TODO: revisit
-}
-
 //
 // Purpose: get registry
 //
@@ -8614,93 +8474,6 @@
 }
 
 //
-// Purpose: issue tmsync
-//
-SB_Export int msg_mon_tmsync_issue(void *pp_data,
-                                   int   pv_len,
-                                   int  *pp_handle,
-                                   int   pv_tag) {
-    const char   *WHERE = "msg_mon_tmsync_issue";
-    Mon_Msg_Type *lp_msg;
-    int           lv_mpierr;
-    SB_API_CTR   (lv_zctr, MSG_MON_TMSYNC_ISSUE);
-
-    SB_UTRACE_API_ADD2(SB_UTRACE_API_OP_MSG_MON_TMSYNC_ISSUE, 0);
-
-    if (gv_ms_trace_mon)
-        trace_where_printf(WHERE, "ENTER data=%p, len=%d, tag=%d\n",
-                           pp_data, pv_len, pv_tag);
-    if (!gv_ms_mon_calls_ok) // msg_mon_tmsync_issue
-        return ms_err_rtn_msg(WHERE, "msg_init() or startup not called or shutdown",
-                              XZFIL_ERR_INVALIDSTATE);
-
-    Mon_Msg_Auto lv_msg;
-    lp_msg = &lv_msg;
-    lp_msg->type = MsgType_Service;
-    lp_msg->noreply = false;
-    lp_msg->u.request.type = ReqType_TmSync;
-    lp_msg->u.request.u.tm_sync.nid = gv_ms_su_nid;
-    lp_msg->u.request.u.tm_sync.pid = gv_ms_su_pid;
-    lp_msg->u.request.u.tm_sync.length = pv_len;
-    lp_msg->u.request.u.tm_sync.tag = pv_tag;
-
-    memcpy(lp_msg->u.request.u.tm_sync.data, pp_data, pv_len);
-
-    if (gv_ms_trace_mon) {
-        trace_where_printf(WHERE, "send tmsync req to mon, p-id=%d/%d\n",
-                           lp_msg->u.request.u.tm_sync.nid,
-                           lp_msg->u.request.u.tm_sync.pid);
-        trace_print_data(pp_data, pv_len, pv_len);
-    }
-
-    lv_mpierr = msg_mon_sendrecv_mon(WHERE,
-                                     "tmsync",
-                                     lp_msg,
-                                     lv_msg.get_error());
-    if (msg_mon_msg_ok(WHERE,
-                       "tmsync req",
-                       &lv_mpierr,
-                       lp_msg,
-                       MsgType_Service,
-                       ReplyType_TmSync)) {
-        lv_mpierr = lp_msg->u.reply.u.tm_sync.return_code;
-        if (lv_mpierr == MPI_SUCCESS) {
-            *pp_handle = lp_msg->u.reply.u.tm_sync.handle;
-            if (gv_ms_trace_mon)
-                trace_where_printf(WHERE, "EXIT OK tmsync req, handle=%d\n",
-                                   *pp_handle);
-        } else {
-            *pp_handle = -1;
-            if (gv_ms_trace_mon)
-                trace_where_printf(WHERE, "EXIT FAILURE tmsync, ret=%d\n",
-                                   lv_mpierr);
-        }
-    }
-    return ms_err_mpi_rtn_msg(WHERE, "EXIT", lv_mpierr);
-}
-
-//
-// Purpose: register tmsync callback
-//
-SB_Export int msg_mon_tmsync_register(MS_Mon_TmSync_Cb_Type pv_callback) {
-    const char *WHERE = "msg_mon_tmsync_register";
-    SB_API_CTR (lv_zctr, MSG_MON_TMSYNC_REGISTER);
-
-    SB_UTRACE_API_ADD2(SB_UTRACE_API_OP_MSG_MON_TMSYNC_REGISTER, 0);
-    if (gv_ms_trace_mon)
-        trace_where_printf(WHERE, "ENTER\n");
-    if (!gv_ms_calls_ok) // msg_mon_tmsync_register
-        return ms_err_rtn_msg(WHERE, "msg_init() not called or shutdown",
-                              XZFIL_ERR_INVALIDSTATE);
-    gv_ms_tmsync_callback = pv_callback;
-    int lv_fserr = XZFIL_ERR_OK;
-    if (gv_ms_trace_mon)
-        trace_where_printf(WHERE, "EXIT OK, ret=%d\n", lv_fserr);
-    return lv_fserr;
-}
-
-
-//
 // Purpose: delist trans
 //
 SB_Export int msg_mon_trace_register_change(MS_Mon_Trace_Cb_Type pv_callback) {
@@ -8740,9 +8513,6 @@
     case MsgType_NodeQuiesce:
         msg_mon_trace_msg_node_quiesce(pp_where, pp_msg);
         break;
-    case MsgType_NodePrepare:
-        msg_mon_trace_msg_node_prepare(pp_where, pp_msg);
-        break;
     case MsgType_NodeUp:
         msg_mon_trace_msg_node_up(pp_where, pp_msg);
         break;
@@ -8758,12 +8528,6 @@
     case MsgType_Shutdown:
         msg_mon_trace_msg_shutdown(pp_where, pp_msg);
         break;
-    case MsgType_TmSyncAbort:
-        msg_mon_trace_msg_tmsync(pp_where, pp_msg);
-        break;
-    case MsgType_TmSyncCommit:
-        msg_mon_trace_msg_tmsync(pp_where, pp_msg);
-        break;
     default:
         msg_mon_trace_msg_unknown(pp_where, pp_msg);
         break;
@@ -8807,13 +8571,6 @@
                        pp_msg->u.request.u.quiesce.node_name);
 }
 
-void msg_mon_trace_msg_node_prepare(const char   *pp_where,
-                                    Mon_Msg_Type *pp_msg) {
-    trace_where_printf(pp_where, "mon-msg-node-prepare nid=%d, node-name=%s\n",
-                       pp_msg->u.request.u.prepare.nid,
-                       pp_msg->u.request.u.prepare.node_name);
-}
-
 void msg_mon_trace_msg_node_up(const char *pp_where, Mon_Msg_Type *pp_msg) {
     trace_where_printf(pp_where, "mon-msg-node-up nid=%d, node-name=%s\n",
                        pp_msg->u.request.u.up.nid,
@@ -8880,55 +8637,6 @@
                        pp_msg->u.request.u.shutdown.level);
 }
 
-void msg_mon_trace_msg_tmsync(const char   *pp_where,
-                              Mon_Msg_Type *pp_msg) {
-    int lv_orig_count = pp_msg->u.request.u.tm_sync_notice.orig_count;
-    char la_orig_tag_handle_str[MAX_TM_SYNCS * 30 + 40];
-    if (lv_orig_count > 0) {
-        int *lp_tags = pp_msg->u.request.u.tm_sync_notice.orig_tag;
-        int lv_pcount = lv_orig_count;
-        if (lv_orig_count > MAX_TM_SYNCS)
-            lv_pcount = MAX_TM_SYNCS;
-        char *lp_p = la_orig_tag_handle_str;
-        int *lp_handles = pp_msg->u.request.u.tm_sync_notice.orig_handle;
-        for (int lv_hdl = 0; lv_hdl < lv_pcount; lv_hdl++) {
-            sprintf(lp_p, "%d/%d ", lp_tags[lv_hdl], lp_handles[lv_hdl]);
-            lp_p += strlen(lp_p);
-        }
-        if (lv_pcount != lv_orig_count)
-           strcpy(lp_p, "** truncated **");
-    } else
-        strcpy(la_orig_tag_handle_str, "<none>");
-    const char *lp_msg_type = msg_util_get_msg_type(pp_msg->type);
-    int la_nid[MAX_TM_SYNCS];
-    int lv_count = pp_msg->u.request.u.tm_sync_notice.count;
-    char la_handle_str[MAX_TM_SYNCS * 14 + 40];
-    if (lv_count > 0) {
-        int lv_pcount = lv_count;
-        if (lv_count > MAX_TM_SYNCS)
-            lv_pcount = MAX_TM_SYNCS;
-        for (int lv_inx = 0; lv_inx < lv_pcount; lv_inx++) {
-            la_nid[lv_inx] = pp_msg->u.request.u.tm_sync_notice.nid[lv_inx];
-        }
-        char *lp_p = la_handle_str;
-        int *lp_handles = pp_msg->u.request.u.tm_sync_notice.handle;
-        for (int lv_hdl = 0; lv_hdl < lv_pcount; lv_hdl++) {
-            sprintf(lp_p, "%d ", lp_handles[lv_hdl]);
-            lp_p += strlen(lp_p);
-        }
-        if (lv_pcount != lv_count)
-           strcpy(lp_p, "** truncated **");
-    } else
-        strcpy(la_handle_str, "<none>");
-    trace_where_printf(pp_where, "mon-msg-%s mon message, nid=%d, orig_count=%d, orig_tag-handles=%s, count=%d, handles=%s\n",
-                       lp_msg_type,
-                       la_nid[0],
-                       lv_orig_count,
-                       la_orig_tag_handle_str,
-                       lv_count,
-                       la_handle_str);
-}
-
 void msg_mon_trace_msg_unknown(const char *pp_where, Mon_Msg_Type *pp_msg) {
     pp_msg = pp_msg; // touch
     trace_where_printf(pp_where, "mon-msg-unknown, type=%d\n", pp_msg->type);
diff --git a/core/sqf/src/seabed/src/msx.h b/core/sqf/src/seabed/src/msx.h
index 2120948..f1d679e 100644
--- a/core/sqf/src/seabed/src/msx.h
+++ b/core/sqf/src/seabed/src/msx.h
@@ -100,7 +100,7 @@
 extern short                 ms_err_mpi_rtn_msg_noassert(const char *pp_where,
                                                          const char *pp_msg,
                                                          int         pv_mpierr);
-extern void                  ms_fifo_setup(int pv_orig_fd, char *pp_fifo_name);
+extern void                  ms_fifo_setup(int pv_orig_fd, char *pp_fifo_name, bool pv_remap_fd = true); // pv_remap_fd to monitor
 extern void                  ms_free_recv_bufs(MS_Md_Type *pp_md);
 extern SB_Comp_Queue        *ms_fsdone_get_comp_q(bool pv_ts);
 extern void                  ms_gather_info(const char *pp_where);
diff --git a/core/sqf/src/seabed/src/otrace.cpp b/core/sqf/src/seabed/src/otrace.cpp
index ae111f8..f85a584 100644
--- a/core/sqf/src/seabed/src/otrace.cpp
+++ b/core/sqf/src/seabed/src/otrace.cpp
@@ -113,7 +113,7 @@
 
 SB_Trace::~SB_Trace() {
     if (ip_trace_mem_buf != NULL) {
-        fprintf(ip_trace_file, ip_trace_mem_buf);
+        fprintf(ip_trace_file,"%s", ip_trace_mem_buf);
         delete [] ip_trace_mem_buf;
         ip_trace_mem_buf = NULL;
     }
@@ -152,7 +152,7 @@
 void SB_Trace::trace_flush() {
     if (ip_trace_file != NULL) {
         if (ip_trace_mem_buf != NULL) {
-            fprintf(ip_trace_file, ip_trace_mem_buf);
+            fprintf(ip_trace_file,"%s", ip_trace_mem_buf);
             iv_trace_mem_inx = 0;
         }
         fflush(ip_trace_file);
@@ -734,7 +734,7 @@
             getpid(), gettid(),
             pp_file, pv_line, pp_fun,
             pp_exp);
-    fprintf(stderr, la_buf);
+    fprintf(stderr,"%s", la_buf);
     fflush(stderr);
     abort(); // can't use SB_util_abort
 }
@@ -766,7 +766,7 @@
             getpid(), gettid(),
             pp_file, pv_line, pp_fun,
             pp_exp, pv_lhs, pv_rhs);
-    fprintf(stderr, la_buf);
+    fprintf(stderr,"%s", la_buf);
     fflush(stderr);
     abort(); // can't use SB_util_abort
 }
@@ -798,7 +798,7 @@
             getpid(), gettid(),
             pp_file, pv_line, pp_fun,
             pp_exp, pp_lhs, pp_rhs);
-    fprintf(stderr, la_buf);
+    fprintf(stderr, "%s",la_buf);
     fflush(stderr);
     abort(); // can't use SB_util_abort
 }
diff --git a/core/sqf/src/seabed/src/sock.cpp b/core/sqf/src/seabed/src/sock.cpp
index 683225d..95c9f57 100644
--- a/core/sqf/src/seabed/src/sock.cpp
+++ b/core/sqf/src/seabed/src/sock.cpp
@@ -42,6 +42,7 @@
 
 #include "buf.h"
 #include "mstrace.h"
+#include "msx.h"
 #include "socktrans.h"
 
 #ifndef AF_INET_SDP
@@ -207,6 +208,8 @@
     }
     if (lv_sock == -1)
         return lv_errno;
+    lv_err = getGlobalSockCtrl()->set_keepalive(WHERE, lv_sock);
+    SB_util_assert_ieq(lv_err, 0);
     lv_err = getGlobalSockCtrl()->set_nodelay(WHERE, lv_sock);
     SB_util_assert_ieq(lv_err, 0);
     lv_err = getGlobalSockCtrl()->set_size_recv(WHERE, lv_sock, SIZE);
@@ -303,12 +306,21 @@
                                           int         pv_fd,
                                           int         pv_event,
                                           void       *pp_data) {
+
+    static bool         sv_ignore_enoent = true;
+    static bool         sv_envvar_ignore_enoent_read = false;
+
     char                la_errno[100];
     const char         *lp_op;
     int                 lv_err;
     int                 lv_errno;
     struct epoll_event  lv_event;
 
+    if (! sv_envvar_ignore_enoent_read) {
+      sv_envvar_ignore_enoent_read = true;
+      ms_getenv_bool("SQ_SB_IGNORE_ENOENT", &sv_ignore_enoent);
+    }
+
     lv_event.events = pv_event;
     lv_event.data.ptr = pp_data;
     lv_err = ::epoll_ctl(iv_efd, pv_op, pv_fd, &lv_event);
@@ -332,6 +344,21 @@
                                "epoll-ctl op=%d(%s), fd=%d, event=%d, data=%p, err=%d\n",
                                pv_op, lp_op, pv_fd, pv_event, pp_data, lv_err);
     }
+
+    if ((sv_ignore_enoent) &&
+        (lv_err == -1) &&
+        (lv_errno == ENOENT) &&
+        ((pv_op == EPOLL_CTL_MOD) ||
+         (pv_op == EPOLL_CTL_DEL))) {
+        lp_op = sock_get_label_epoll_ctl(pv_op);
+        SB_Buf_Line la_buf;
+        sprintf(la_buf, 
+            "epoll_ctl ignoring ENOENT op=%d(%s), fd=%d, event=%d, data=%p, err=%d\n",
+            pv_op, lp_op, pv_fd, pv_event, pp_data, lv_err);
+        sb_util_write_log(la_buf);
+        return;
+    }
+
     SB_util_assert_ine(lv_err, -1);
 }
 
@@ -405,6 +432,54 @@
     return lv_err;
 }
 
+//
+// Purpose: enable TCP keepalive on pv_sock and set its probe timing.
+// Returns 0 if every setsockopt succeeds, else the first failing result.
+//
+int SB_Trans::Sock_Controller::set_keepalive(const char *pp_where,
+                                             int         pv_sock) {
+    static bool sv_envvar_read = false;
+    static int sv_sockkeepalive = 1;
+    static int sv_tcpkeepidle = 240;
+    static int sv_tcpkeepintvl = 6;
+    static int sv_tcpkeepcnt = 10;
+
+    if (! sv_envvar_read) {
+      sv_envvar_read = true;
+      ms_getenv_int("SQ_SB_KEEPALIVE", &sv_sockkeepalive);
+      ms_getenv_int("SQ_SB_KEEPIDLE", &sv_tcpkeepidle);
+      ms_getenv_int("SQ_SB_KEEPINTVL", &sv_tcpkeepintvl);
+      ms_getenv_int("SQ_SB_KEEPCNT", &sv_tcpkeepcnt);
+    }
+
+    // options in the order they are applied
+    const struct {
+        int  iv_level;
+        int  iv_optname;
+        int *ip_value;
+    } la_opts[] = {
+        { SOL_SOCKET,  SO_KEEPALIVE,  &sv_sockkeepalive },
+        { IPPROTO_TCP, TCP_KEEPIDLE,  &sv_tcpkeepidle   },
+        { IPPROTO_TCP, TCP_KEEPINTVL, &sv_tcpkeepintvl  },
+        { IPPROTO_TCP, TCP_KEEPCNT,   &sv_tcpkeepcnt    }
+    };
+    const int lv_max = static_cast<int>(sizeof(la_opts) / sizeof(la_opts[0]));
+
+    // stop at the first failure so a later success cannot mask it
+    int lv_err = 0;
+    for (int lv_inx = 0; (lv_inx < lv_max) && (lv_err == 0); lv_inx++)
+        lv_err = setsockopt(pv_sock,
+                            la_opts[lv_inx].iv_level,
+                            la_opts[lv_inx].iv_optname,
+                            reinterpret_cast<char *>(la_opts[lv_inx].ip_value),
+                            sizeof(*la_opts[lv_inx].ip_value));
+
+    if (gv_ms_trace_sock)
+        trace_where_printf(pp_where, "setsockopt KEEPALIVE sock=%d, err=%d\n",
+                           pv_sock, lv_err);
+    return lv_err;
+}
+
 int SB_Trans::Sock_Controller::set_nonblock(const char *pp_where,
                                             int         pv_sock) {
     int          lv_err;
@@ -788,8 +863,16 @@
     if (gv_ms_trace_sock)
         trace_where_printf(WHERE, "bind complete, sock=%d\n",
                            lv_sock);
-    lv_err = ::listen(lv_sock, 10);
+
+    static bool sv_envvar_read = false;
+    static int sv_listen_backlog = 1024;
+    if (! sv_envvar_read) {
+      sv_envvar_read = true;
+      ms_getenv_int("SQ_SB_LISTEN_BACKLOG", &sv_listen_backlog);
+    }
+    lv_err = ::listen(lv_sock, sv_listen_backlog);
     SB_util_assert_ine(lv_err, -1);
+
     iv_sock = lv_sock;
     lv_len = sizeof(lv_addr);
     lv_err = getsockname(lv_sock, reinterpret_cast<struct sockaddr *>(&lv_addr), &lv_len);
diff --git a/core/sqf/src/seabed/src/sockstream.cpp b/core/sqf/src/seabed/src/sockstream.cpp
index 83ba1aa..0a4ef08 100644
--- a/core/sqf/src/seabed/src/sockstream.cpp
+++ b/core/sqf/src/seabed/src/sockstream.cpp
@@ -29,6 +29,7 @@
 #include <string.h>
 #include <unistd.h>
 
+#include <netdb.h>
 #include <net/if.h>
 #include <net/if_arp.h>
 
@@ -269,7 +270,7 @@
 //
 // Purpose: close streams
 //
-void SB_Trans::Sock_Stream::close_streams() {
+void SB_Trans::Sock_Stream::close_streams(bool pv_join) {
     const char *WHERE = "Sock_Stream::close_streams";
     int         lv_status;
 
@@ -305,11 +306,13 @@
         if (gv_ms_trace_sock)
             trace_where_printf(WHERE, "shutdown accept thread\n");
         cp_accept_thread->fin();
-        lv_status = cp_accept_thread->join(&lp_result);
-        if (gv_ms_trace_sock)
-            trace_where_printf(WHERE, "accept thread death, status=%d\n",
-                               lv_status);
-        SB_util_assert_ieq(lv_status, 0);
+        if (pv_join) {
+            lv_status = cp_accept_thread->join(&lp_result);
+            if (gv_ms_trace_sock)
+                trace_where_printf(WHERE, "accept thread death, status=%d\n",
+                                   lv_status);
+            SB_util_assert_ieq(lv_status, 0);
+        }
         cp_accept_thread = NULL;
     }
     if (cp_helper_thread != NULL) {
@@ -1258,6 +1261,7 @@
     static bool         lv_first = true;
     bool                lv_ok;
     static int          lv_port;
+    struct hostent     *lp_hostent = NULL;
 
     if (lv_first) {
         lv_first = false;
@@ -1270,6 +1274,12 @@
         } else {
             lv_err = gethostname(la_host, sizeof(la_host));
             SB_util_assert_if(lv_err);
+            lp_hostent = gethostbyname( la_host );
+            SB_util_assert_if(!lp_hostent);
+            lp_addr = reinterpret_cast<unsigned char *>(lp_hostent->h_addr);
+            sprintf(la_host, "%d.%d.%d.%d",
+                    lp_addr[0], lp_addr[1], lp_addr[2], lp_addr[3]);
+            
         }
         lp_host = la_host;
         cp_listener = new Sock_Listener();
@@ -2048,7 +2058,7 @@
 // (static)
 //
 void SB_Trans::Sock_Stream::shutdown() {
-    close_streams();
+    close_streams(false);
 }
 
 void SB_Trans::Sock_Stream::sock_free() {
diff --git a/core/sqf/src/seabed/src/socktrans.h b/core/sqf/src/seabed/src/socktrans.h
index 69b88e7..51fe2e8 100644
--- a/core/sqf/src/seabed/src/socktrans.h
+++ b/core/sqf/src/seabed/src/socktrans.h
@@ -70,6 +70,8 @@
         void        epoll_wait(const char *pp_where,
                                int         pv_timeout);
         void        lock();
+        int         set_keepalive(const char *pp_where,
+                                  int         pv_sock);
         int         set_nodelay(const char *pp_where,
                                 int         pv_sock);
         int         set_nonblock(const char *pp_where,
@@ -286,7 +288,7 @@
 
         static int          check_streams();
         static void         close_stream(Sock_Stream *pp_stream, bool pv_local);
-        static void         close_streams();
+        static void         close_streams(bool pv_join);
         // execute functions
         virtual short       exec_abandon(MS_Md_Type *pp_md,
                                          int         pv_reqid,
diff --git a/core/sqf/src/seabed/src/threadl.cpp b/core/sqf/src/seabed/src/threadl.cpp
index 2a04a52..1d63a3e 100644
--- a/core/sqf/src/seabed/src/threadl.cpp
+++ b/core/sqf/src/seabed/src/threadl.cpp
@@ -501,7 +501,7 @@
     int lv_status;
 
     lv_status = destroy();
-    SB_util_assert_ieq(lv_status, 0); // sw fault
+    // NOTE: Disabling this assert:   SB_util_assert_ieq(lv_status, 0); // sw fault
     lv_status = lv_status; // touch (in case assert disabled)
 }
 #else
diff --git a/core/sqf/src/seabed/src/threadtls.cpp b/core/sqf/src/seabed/src/threadtls.cpp
index 028e2cd..c73466d 100644
--- a/core/sqf/src/seabed/src/threadtls.cpp
+++ b/core/sqf/src/seabed/src/threadtls.cpp
@@ -76,7 +76,7 @@
             getpid(), gettid(),
             pp_file, pv_line, pp_fun,
             pp_exp);
-    fprintf(stderr, la_buf);
+    fprintf(stderr, "%s", la_buf);
     fflush(stderr);
     abort(); // can't use SB_util_abort
 }
diff --git a/core/sqf/src/seabed/src/util.h b/core/sqf/src/seabed/src/util.h
index 923867c..fea6ac9 100644
--- a/core/sqf/src/seabed/src/util.h
+++ b/core/sqf/src/seabed/src/util.h
@@ -52,6 +52,7 @@
 extern void  SB_util_short_lock();
 extern void  SB_util_short_unlock();
 
+extern void sb_util_write_log(char *pp_buf);
 
 // make printf happy
 #  define pfp(p)  (void *) (long) p
diff --git a/core/sqf/src/seabed/src/utracex.h b/core/sqf/src/seabed/src/utracex.h
index ccb51f3..61f5ffc 100644
--- a/core/sqf/src/seabed/src/utracex.h
+++ b/core/sqf/src/seabed/src/utracex.h
@@ -117,6 +117,7 @@
     SB_UTRACE_API_OP_MSG_MON_GET_TRANS_INFO_TRANSID,
     SB_UTRACE_API_OP_MSG_MON_GET_ZONE_INFO,
     SB_UTRACE_API_OP_MSG_MON_GET_ZONE_INFO_DETAIL,
+    SB_UTRACE_API_OP_MSG_MON_GET_INSTANCE_ID,
     SB_UTRACE_API_OP_MSG_MON_MOUNT_DEVICE,
     SB_UTRACE_API_OP_MSG_MON_MOUNT_DEVICE2,
     SB_UTRACE_API_OP_MSG_MON_NODE_DOWN,
diff --git a/core/sqf/src/seabed/src/xchkhdr.sh b/core/sqf/src/seabed/src/xchkhdr.sh
index ec5daf3..d8b1dfa 100644
--- a/core/sqf/src/seabed/src/xchkhdr.sh
+++ b/core/sqf/src/seabed/src/xchkhdr.sh
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 #
 # @@@ START COPYRIGHT @@@
 #
diff --git a/core/sqf/src/seabed/test/Makefile b/core/sqf/src/seabed/test/Makefile
index 6b1e47f..f4db08d 100644
--- a/core/sqf/src/seabed/test/Makefile
+++ b/core/sqf/src/seabed/test/Makefile
@@ -24,12 +24,12 @@
 		  t20 t21 t22 t23 t24 t25     t27 \
 		  t30 t31 t32         t35 t36 t37 t38 t39 \
 		  t40 t41 t42 t43 t44             t48 t49 \
-		  t50 t51 t52             t56 t57 t58 \
+		  t50     t52             t56 t57 t58 \
 		      t61 t62 t63 t64 t65 t66 t67 t68 t69 \
 		  t70 t71 t72 t73 t74 t75 t76 t77 t78 t79 \
 		  t80         t83 t84 t85 t86     t88 \
 		      t91 t92 t93 t94     t96     t98 \
-		  t100 t101 t102 t103 t104 t105                     \
+		  t100 t101 t102 t103 t104                          \
 		                           t115           t118      \
 		                      t124           t127 t128 t129 \
 		  t130 t131 t132 t133      t135      t137 t138 t139 \
@@ -43,11 +43,11 @@
 		  t210      t212 t213 t214 t215 t216 t217      t219 \
 		  t220 t221 t222                t226 t227 t228 \
 		       t231      t233 t234 t235           t238      \
-		  t240 t241 t242           t245 t246                \
+		  t240 t241 t242           t245                \
 		                 t253                               \
-		       t261 t262                t266           \
+		            t262                                    \
 		                                     t277      t279 \
-		  t280 t281 t282      t284      t286 \
+		  t280 t281 t282      t284      t286 t287\
 		  tcheck tdelta tmerge tsock ttee
 PROGSI		= t2fs t3ms t4ms \
 		  t5ms t6ms t9thread t14ms \
@@ -95,12 +95,11 @@
 		  t233ms t234ms \
 		  t235ms t238ms \
 		  t240ms t241ms t242fs \
-		  t245ms t246ms \
+		  t245ms \
 		  t253ms \
 		  t261ms t262ms \
-		  t266ms \
-	          t277fs t279ms \
-	          t280fs t281ms t282ms t284cli t284srv
+		  t277fs t279ms \
+		  t280fs t281ms t282ms t284cli t284srv
 
 ###########################################################################
 
@@ -148,11 +147,11 @@
 # rule macros
 RCXXINCLUDES	= $(CXX) $(CDEPFLAGS) $(CXXFLAGS) $(INCLUDESX) -o $@ -c $<
 RCXXINCLUDESSRC	= $(CXX) $(CDEPFLAGS) $(CXXFLAGS) $(INCLUDESSRCX) -o $@ -c $<
-RFS		= $(LIBTMSUTILX) $(LIBTUTILX) $(LIBTUTILPX) $(LIBSBFSX) $(DEBUG)
-RMS		= $(LIBTMSUTILX) $(LIBTUTILX) $(LIBTUTILPX) $(LIBSBMSX) $(DEBUG)
+RFS		= $(LIBTMSUTILX) $(LIBTUTILX) $(LIBTUTILPX) $(LIBSBUTILX) $(LIBSBFSX) $(DEBUG)
+RMS		= $(LIBTMSUTILX) $(LIBTUTILX) $(LIBTUTILPX) $(LIBSBUTILX) $(LIBSBMSX) $(DEBUG)
 RMSEG		= $(LIBSBMSX) $(DEBUG)
-RMSUTP		= $(LIBSBMSX) $(LIBTUTILPX) $(DEBUG)
-RMX		= $(LIBTUTILX) $(LIBTUTILPX) $(DEBUG)
+RMSUTP		= $(LIBSBMSX) $(LIBTUTILPX) $(LIBSBUTILX) $(DEBUG)
+RMX		= $(LIBTUTILX) $(LIBTUTILPX) $(LIBSBUTILX) $(DEBUG)
 # target macros
 TFS		= $(LIBTMSUTIL) $(LIBTUTIL) $(LIBTUTILP) $(LIBSBFS)
 TMS		= $(LIBTMSUTIL) $(LIBTUTIL) $(LIBTUTILP) $(LIBSBMS)
@@ -160,6 +159,10 @@
 TMSUTP		= $(LIBSBMS) $(LIBTUTILP)
 TMX		= $(LIBTUTIL) $(LIBTUTILP)
 
+# Targets
+# Parallel make is disabled by the following target; comment it out to re-enable
+.NOTPARALLEL:
+
 all: $(PROGS)
 
 t2: $(CHK) t2fs
@@ -672,7 +675,7 @@
 
 t157: $(CHK) t157ms
 t157ms: $(OUTDIR)/t157ms.o $(TMX)
-	$(LINKITCXX) -o $@ $< $(RMX) -lpthread
+	$(LINKITCXX) -o $@ $< $(RMX) -lpthread -lrt
 
 t160: $(CHK) t160ms
 t160ms: $(OUTDIR)/t160ms.o $(TMS)
@@ -740,7 +743,7 @@
 
 t184: $(CHK) t184fs
 t184fs: $(OUTDIR)/t184fs.o $(TFS)
-	$(LINKITCXX) -o $@ $< $(RFS)
+	$(LINKITCXX) -o $@ $< $(RFS) -lpthread
 
 t185: $(CHK) t185fs
 t185fs: $(OUTDIR)/t185fs.o $(TFS)
@@ -888,7 +891,7 @@
 
 t238: $(CHK) t238ms
 t238ms: $(OUTDIR)/t238ms.o $(LIBEXPDIR)/libsblogalt.so
-	$(LINKITCXX) -o $@ $< -L$(LIBEXPDIR) -lsblogalt
+	$(LINKITCXX) -o $@ $< -L$(LIBEXPDIR) -lsblogalt -lpthread
 
 t240: $(CHK) t240ms
 t240ms: $(OUTDIR)/t240ms.o $(TMS)
@@ -965,7 +968,7 @@
 	$(LINKITCXX) -o $@ $< $(RMS)
 
 t284srv: $(OUTDIR)/t284srv.o $(TMS)
-	$(LINKITCXX) -o $@ $< $(RMS)
+	$(LINKITCXX) -o $@ $< $(RMS) -lrt
 
 t284clij: $(OUTDIR)/com/hp/traf/t284cli.class $(OUTDIR)/com/hp/traf/t284cb.class $(OUTDIR)/com/hp/traf/t284exc.class $(OUTDIR)/com/hp/traf/t284id.class
 $(OUTDIR)/com/hp/traf/t284cli.class $(OUTDIR)/com/hp/traf/t284cb.class $(OUTDIR)/com/hp/traf/t284exc.class $(OUTDIR)/com/hp/traf/t284id.class: t284cli.java t284cb.java t284exc.java t284id.java
@@ -988,6 +991,10 @@
 t286ms: $(OUTDIR)/t286ms.o $(TMS)
 	$(LINKITCXX) -o $@ $< $(RMS)
 
+t287: $(CHK) t287ms
+t287ms: $(OUTDIR)/t287ms.o $(TMS)
+	$(LINKITCXX) -o $@ $< $(RMS)
+
 
 tcheck: $(OUTDIR)/tcheck.o
 	$(LINKITCXX) $(INCLUDESX) -o $@ $<
diff --git a/core/sqf/src/seabed/test/go100 b/core/sqf/src/seabed/test/go100
index 9fe1c52..d51daff 100755
--- a/core/sqf/src/seabed/test/go100
+++ b/core/sqf/src/seabed/test/go100
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go101 b/core/sqf/src/seabed/test/go101
index 9975e06..7017187 100755
--- a/core/sqf/src/seabed/test/go101
+++ b/core/sqf/src/seabed/test/go101
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go102 b/core/sqf/src/seabed/test/go102
index da4f1cc..65db12c 100755
--- a/core/sqf/src/seabed/test/go102
+++ b/core/sqf/src/seabed/test/go102
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go103 b/core/sqf/src/seabed/test/go103
index 0affe81..8d75445 100755
--- a/core/sqf/src/seabed/test/go103
+++ b/core/sqf/src/seabed/test/go103
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go104 b/core/sqf/src/seabed/test/go104
index e1c2796..a51b51b 100755
--- a/core/sqf/src/seabed/test/go104
+++ b/core/sqf/src/seabed/test/go104
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go105 b/core/sqf/src/seabed/test/go105
index 014deaa..2472093 100755
--- a/core/sqf/src/seabed/test/go105
+++ b/core/sqf/src/seabed/test/go105
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go115 b/core/sqf/src/seabed/test/go115
index e41c67f..bea199b 100755
--- a/core/sqf/src/seabed/test/go115
+++ b/core/sqf/src/seabed/test/go115
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go118 b/core/sqf/src/seabed/test/go118
index 02b5cd8..0097472 100755
--- a/core/sqf/src/seabed/test/go118
+++ b/core/sqf/src/seabed/test/go118
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go124 b/core/sqf/src/seabed/test/go124
index 75b35c2..da08b85 100755
--- a/core/sqf/src/seabed/test/go124
+++ b/core/sqf/src/seabed/test/go124
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go126 b/core/sqf/src/seabed/test/go126
index d346791..9c10f77 100755
--- a/core/sqf/src/seabed/test/go126
+++ b/core/sqf/src/seabed/test/go126
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go127 b/core/sqf/src/seabed/test/go127
index cab0692..23e87c7 100755
--- a/core/sqf/src/seabed/test/go127
+++ b/core/sqf/src/seabed/test/go127
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go128 b/core/sqf/src/seabed/test/go128
index d4a0d69..2d74a86 100755
--- a/core/sqf/src/seabed/test/go128
+++ b/core/sqf/src/seabed/test/go128
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go129 b/core/sqf/src/seabed/test/go129
index ce4ea10..48ba824 100755
--- a/core/sqf/src/seabed/test/go129
+++ b/core/sqf/src/seabed/test/go129
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go130 b/core/sqf/src/seabed/test/go130
index c2f75d6..73c5a85 100755
--- a/core/sqf/src/seabed/test/go130
+++ b/core/sqf/src/seabed/test/go130
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go131 b/core/sqf/src/seabed/test/go131
index 48eaf4f..1a9b580 100755
--- a/core/sqf/src/seabed/test/go131
+++ b/core/sqf/src/seabed/test/go131
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go132 b/core/sqf/src/seabed/test/go132
index 1c6e087..8834a85 100755
--- a/core/sqf/src/seabed/test/go132
+++ b/core/sqf/src/seabed/test/go132
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go133 b/core/sqf/src/seabed/test/go133
index 2f131e2..123aba2 100755
--- a/core/sqf/src/seabed/test/go133
+++ b/core/sqf/src/seabed/test/go133
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go135 b/core/sqf/src/seabed/test/go135
index b066918..fe5b0ea 100755
--- a/core/sqf/src/seabed/test/go135
+++ b/core/sqf/src/seabed/test/go135
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go137 b/core/sqf/src/seabed/test/go137
index 093a24a..c3ad1ee 100755
--- a/core/sqf/src/seabed/test/go137
+++ b/core/sqf/src/seabed/test/go137
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go138 b/core/sqf/src/seabed/test/go138
index d0c91ee..4569e1d 100755
--- a/core/sqf/src/seabed/test/go138
+++ b/core/sqf/src/seabed/test/go138
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go139 b/core/sqf/src/seabed/test/go139
index 8d67eae..af3f509 100755
--- a/core/sqf/src/seabed/test/go139
+++ b/core/sqf/src/seabed/test/go139
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go14 b/core/sqf/src/seabed/test/go14
index 28508d3..5cfccdc 100755
--- a/core/sqf/src/seabed/test/go14
+++ b/core/sqf/src/seabed/test/go14
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go141 b/core/sqf/src/seabed/test/go141
index 0db4400..cc28519 100755
--- a/core/sqf/src/seabed/test/go141
+++ b/core/sqf/src/seabed/test/go141
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go142 b/core/sqf/src/seabed/test/go142
index b98818f..5eff171 100755
--- a/core/sqf/src/seabed/test/go142
+++ b/core/sqf/src/seabed/test/go142
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go145 b/core/sqf/src/seabed/test/go145
index f192e21..deed29c 100755
--- a/core/sqf/src/seabed/test/go145
+++ b/core/sqf/src/seabed/test/go145
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go149 b/core/sqf/src/seabed/test/go149
index 8e62d0f..e38ecdd 100755
--- a/core/sqf/src/seabed/test/go149
+++ b/core/sqf/src/seabed/test/go149
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go15 b/core/sqf/src/seabed/test/go15
index 7239101..cfe3360 100755
--- a/core/sqf/src/seabed/test/go15
+++ b/core/sqf/src/seabed/test/go15
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go151 b/core/sqf/src/seabed/test/go151
index cd09974..1e0859b 100755
--- a/core/sqf/src/seabed/test/go151
+++ b/core/sqf/src/seabed/test/go151
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go152 b/core/sqf/src/seabed/test/go152
index f0225d1..2ca8a03 100755
--- a/core/sqf/src/seabed/test/go152
+++ b/core/sqf/src/seabed/test/go152
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go153 b/core/sqf/src/seabed/test/go153
index 9c974be..95a094b 100755
--- a/core/sqf/src/seabed/test/go153
+++ b/core/sqf/src/seabed/test/go153
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go155 b/core/sqf/src/seabed/test/go155
index a666a00..c88e10b 100755
--- a/core/sqf/src/seabed/test/go155
+++ b/core/sqf/src/seabed/test/go155
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go156 b/core/sqf/src/seabed/test/go156
index ab19b19..60e3210 100755
--- a/core/sqf/src/seabed/test/go156
+++ b/core/sqf/src/seabed/test/go156
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go157 b/core/sqf/src/seabed/test/go157
index 9266add..24f17f2 100755
--- a/core/sqf/src/seabed/test/go157
+++ b/core/sqf/src/seabed/test/go157
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go16 b/core/sqf/src/seabed/test/go16
index 1d08346..110930d 100755
--- a/core/sqf/src/seabed/test/go16
+++ b/core/sqf/src/seabed/test/go16
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go160 b/core/sqf/src/seabed/test/go160
index 51c7d5b..c0bbdd8 100755
--- a/core/sqf/src/seabed/test/go160
+++ b/core/sqf/src/seabed/test/go160
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go161 b/core/sqf/src/seabed/test/go161
index 74e2dc3..8196c18 100755
--- a/core/sqf/src/seabed/test/go161
+++ b/core/sqf/src/seabed/test/go161
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go162 b/core/sqf/src/seabed/test/go162
index 6d63baa..d6c2f43 100755
--- a/core/sqf/src/seabed/test/go162
+++ b/core/sqf/src/seabed/test/go162
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go163 b/core/sqf/src/seabed/test/go163
index 2659fa3..6cad0ae 100755
--- a/core/sqf/src/seabed/test/go163
+++ b/core/sqf/src/seabed/test/go163
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go164 b/core/sqf/src/seabed/test/go164
index 6da13e2..bc7727e 100755
--- a/core/sqf/src/seabed/test/go164
+++ b/core/sqf/src/seabed/test/go164
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go165 b/core/sqf/src/seabed/test/go165
index 85fe854..d43b0d3 100755
--- a/core/sqf/src/seabed/test/go165
+++ b/core/sqf/src/seabed/test/go165
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go166 b/core/sqf/src/seabed/test/go166
index 53e289d..7d3c7f6 100755
--- a/core/sqf/src/seabed/test/go166
+++ b/core/sqf/src/seabed/test/go166
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go167 b/core/sqf/src/seabed/test/go167
index e5d5f47..cae7b5b 100755
--- a/core/sqf/src/seabed/test/go167
+++ b/core/sqf/src/seabed/test/go167
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go168 b/core/sqf/src/seabed/test/go168
index c44fb0c..35e293a 100755
--- a/core/sqf/src/seabed/test/go168
+++ b/core/sqf/src/seabed/test/go168
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go169 b/core/sqf/src/seabed/test/go169
index b05eea3..7413c30 100755
--- a/core/sqf/src/seabed/test/go169
+++ b/core/sqf/src/seabed/test/go169
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go17 b/core/sqf/src/seabed/test/go17
index 6aec71f..f13300e 100755
--- a/core/sqf/src/seabed/test/go17
+++ b/core/sqf/src/seabed/test/go17
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go170 b/core/sqf/src/seabed/test/go170
index 8c4bbf4..92993b5 100755
--- a/core/sqf/src/seabed/test/go170
+++ b/core/sqf/src/seabed/test/go170
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go172 b/core/sqf/src/seabed/test/go172
index d3ccbd6..5351ffa 100755
--- a/core/sqf/src/seabed/test/go172
+++ b/core/sqf/src/seabed/test/go172
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go175 b/core/sqf/src/seabed/test/go175
index 01727d2..c813127 100755
--- a/core/sqf/src/seabed/test/go175
+++ b/core/sqf/src/seabed/test/go175
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go176 b/core/sqf/src/seabed/test/go176
index e0e9ad9..2c73a84 100755
--- a/core/sqf/src/seabed/test/go176
+++ b/core/sqf/src/seabed/test/go176
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go178 b/core/sqf/src/seabed/test/go178
index 7009832..91a537d 100755
--- a/core/sqf/src/seabed/test/go178
+++ b/core/sqf/src/seabed/test/go178
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go179 b/core/sqf/src/seabed/test/go179
index 36cf6e9..857c556 100755
--- a/core/sqf/src/seabed/test/go179
+++ b/core/sqf/src/seabed/test/go179
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go18 b/core/sqf/src/seabed/test/go18
index 9c3c4e4..59efb61 100755
--- a/core/sqf/src/seabed/test/go18
+++ b/core/sqf/src/seabed/test/go18
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go180 b/core/sqf/src/seabed/test/go180
index a8f08c4..0b4d148 100755
--- a/core/sqf/src/seabed/test/go180
+++ b/core/sqf/src/seabed/test/go180
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go181 b/core/sqf/src/seabed/test/go181
index 3ed0469..2584880 100755
--- a/core/sqf/src/seabed/test/go181
+++ b/core/sqf/src/seabed/test/go181
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go182 b/core/sqf/src/seabed/test/go182
index 12b84df..9ea072b 100755
--- a/core/sqf/src/seabed/test/go182
+++ b/core/sqf/src/seabed/test/go182
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go183 b/core/sqf/src/seabed/test/go183
index 1e702bb..4b7cddf 100755
--- a/core/sqf/src/seabed/test/go183
+++ b/core/sqf/src/seabed/test/go183
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go184 b/core/sqf/src/seabed/test/go184
index 4d4e6ca..eefc94a 100755
--- a/core/sqf/src/seabed/test/go184
+++ b/core/sqf/src/seabed/test/go184
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go185 b/core/sqf/src/seabed/test/go185
index 19446fa..4c2692f 100755
--- a/core/sqf/src/seabed/test/go185
+++ b/core/sqf/src/seabed/test/go185
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go186 b/core/sqf/src/seabed/test/go186
index 2a69d57..16f4ecf 100755
--- a/core/sqf/src/seabed/test/go186
+++ b/core/sqf/src/seabed/test/go186
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go187 b/core/sqf/src/seabed/test/go187
index 5bdef38..d15d6ab 100755
--- a/core/sqf/src/seabed/test/go187
+++ b/core/sqf/src/seabed/test/go187
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go188 b/core/sqf/src/seabed/test/go188
index 7a22e85..3483437 100755
--- a/core/sqf/src/seabed/test/go188
+++ b/core/sqf/src/seabed/test/go188
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go19 b/core/sqf/src/seabed/test/go19
index 46f1bed..937d5ef 100755
--- a/core/sqf/src/seabed/test/go19
+++ b/core/sqf/src/seabed/test/go19
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go190 b/core/sqf/src/seabed/test/go190
index 0ab0954..1bd2aa8 100755
--- a/core/sqf/src/seabed/test/go190
+++ b/core/sqf/src/seabed/test/go190
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go192 b/core/sqf/src/seabed/test/go192
index ef5736b..7955476 100755
--- a/core/sqf/src/seabed/test/go192
+++ b/core/sqf/src/seabed/test/go192
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go194 b/core/sqf/src/seabed/test/go194
index b480b62..0a3767f 100755
--- a/core/sqf/src/seabed/test/go194
+++ b/core/sqf/src/seabed/test/go194
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go195 b/core/sqf/src/seabed/test/go195
index c8508c1..0e76d65 100755
--- a/core/sqf/src/seabed/test/go195
+++ b/core/sqf/src/seabed/test/go195
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go197 b/core/sqf/src/seabed/test/go197
index 269c71a..427a525 100755
--- a/core/sqf/src/seabed/test/go197
+++ b/core/sqf/src/seabed/test/go197
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go198 b/core/sqf/src/seabed/test/go198
index 992eb21..2c7555b 100755
--- a/core/sqf/src/seabed/test/go198
+++ b/core/sqf/src/seabed/test/go198
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go2 b/core/sqf/src/seabed/test/go2
index dbce392..6ba6334 100755
--- a/core/sqf/src/seabed/test/go2
+++ b/core/sqf/src/seabed/test/go2
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go20 b/core/sqf/src/seabed/test/go20
index a295b02..7817a1b 100755
--- a/core/sqf/src/seabed/test/go20
+++ b/core/sqf/src/seabed/test/go20
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go201 b/core/sqf/src/seabed/test/go201
index fecef82..b4cd522 100755
--- a/core/sqf/src/seabed/test/go201
+++ b/core/sqf/src/seabed/test/go201
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go202 b/core/sqf/src/seabed/test/go202
index d910bb8..69130c7 100755
--- a/core/sqf/src/seabed/test/go202
+++ b/core/sqf/src/seabed/test/go202
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go205 b/core/sqf/src/seabed/test/go205
index 4ed7b23..0ced7e4 100755
--- a/core/sqf/src/seabed/test/go205
+++ b/core/sqf/src/seabed/test/go205
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go206 b/core/sqf/src/seabed/test/go206
index 0e5abaf..04ef284b 100755
--- a/core/sqf/src/seabed/test/go206
+++ b/core/sqf/src/seabed/test/go206
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go207 b/core/sqf/src/seabed/test/go207
index c287671..430c917 100755
--- a/core/sqf/src/seabed/test/go207
+++ b/core/sqf/src/seabed/test/go207
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go208 b/core/sqf/src/seabed/test/go208
index 454b898..85b5acf 100755
--- a/core/sqf/src/seabed/test/go208
+++ b/core/sqf/src/seabed/test/go208
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go209 b/core/sqf/src/seabed/test/go209
index d0cb140..1aa6650 100755
--- a/core/sqf/src/seabed/test/go209
+++ b/core/sqf/src/seabed/test/go209
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go209sh b/core/sqf/src/seabed/test/go209sh
index 2787ac6..70f181d 100755
--- a/core/sqf/src/seabed/test/go209sh
+++ b/core/sqf/src/seabed/test/go209sh
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go209vc b/core/sqf/src/seabed/test/go209vc
index 7002e63..738cd1d 100755
--- a/core/sqf/src/seabed/test/go209vc
+++ b/core/sqf/src/seabed/test/go209vc
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go21 b/core/sqf/src/seabed/test/go21
index 9651437..254286e 100755
--- a/core/sqf/src/seabed/test/go21
+++ b/core/sqf/src/seabed/test/go21
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go210 b/core/sqf/src/seabed/test/go210
index c4b54af..b4861a6 100755
--- a/core/sqf/src/seabed/test/go210
+++ b/core/sqf/src/seabed/test/go210
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go212 b/core/sqf/src/seabed/test/go212
index 4166ba2..97d9cdf 100755
--- a/core/sqf/src/seabed/test/go212
+++ b/core/sqf/src/seabed/test/go212
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go213 b/core/sqf/src/seabed/test/go213
index 6e6bd73..dbd6b1b 100755
--- a/core/sqf/src/seabed/test/go213
+++ b/core/sqf/src/seabed/test/go213
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go214 b/core/sqf/src/seabed/test/go214
index 7482cfb..11836c7 100755
--- a/core/sqf/src/seabed/test/go214
+++ b/core/sqf/src/seabed/test/go214
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go215 b/core/sqf/src/seabed/test/go215
index 304da41..db9e0c4 100755
--- a/core/sqf/src/seabed/test/go215
+++ b/core/sqf/src/seabed/test/go215
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go216 b/core/sqf/src/seabed/test/go216
index 172d86e..29fd4b5 100755
--- a/core/sqf/src/seabed/test/go216
+++ b/core/sqf/src/seabed/test/go216
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go217 b/core/sqf/src/seabed/test/go217
index c360e4e..af66200 100755
--- a/core/sqf/src/seabed/test/go217
+++ b/core/sqf/src/seabed/test/go217
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go219 b/core/sqf/src/seabed/test/go219
index 6567f5a..13021ed 100755
--- a/core/sqf/src/seabed/test/go219
+++ b/core/sqf/src/seabed/test/go219
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go22 b/core/sqf/src/seabed/test/go22
index 7326fba..f4a4f3f 100755
--- a/core/sqf/src/seabed/test/go22
+++ b/core/sqf/src/seabed/test/go22
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go220 b/core/sqf/src/seabed/test/go220
index b56ff6c..3f60e2e 100755
--- a/core/sqf/src/seabed/test/go220
+++ b/core/sqf/src/seabed/test/go220
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go221 b/core/sqf/src/seabed/test/go221
index 9f26c4d..90837df 100755
--- a/core/sqf/src/seabed/test/go221
+++ b/core/sqf/src/seabed/test/go221
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go222 b/core/sqf/src/seabed/test/go222
index 8f5ad53..1089d1b 100755
--- a/core/sqf/src/seabed/test/go222
+++ b/core/sqf/src/seabed/test/go222
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go226 b/core/sqf/src/seabed/test/go226
index 7a2f6fa..0c1cd46 100755
--- a/core/sqf/src/seabed/test/go226
+++ b/core/sqf/src/seabed/test/go226
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go227 b/core/sqf/src/seabed/test/go227
index e9d154f..1aeb2e0 100755
--- a/core/sqf/src/seabed/test/go227
+++ b/core/sqf/src/seabed/test/go227
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go228 b/core/sqf/src/seabed/test/go228
index bf6dd20..43b1dcb 100755
--- a/core/sqf/src/seabed/test/go228
+++ b/core/sqf/src/seabed/test/go228
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go229 b/core/sqf/src/seabed/test/go229
index f66dbf0..68ce9ba 100755
--- a/core/sqf/src/seabed/test/go229
+++ b/core/sqf/src/seabed/test/go229
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go23 b/core/sqf/src/seabed/test/go23
index 4fdecf2..41e9382 100755
--- a/core/sqf/src/seabed/test/go23
+++ b/core/sqf/src/seabed/test/go23
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go231 b/core/sqf/src/seabed/test/go231
index 0c69499..2e24038 100755
--- a/core/sqf/src/seabed/test/go231
+++ b/core/sqf/src/seabed/test/go231
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go233 b/core/sqf/src/seabed/test/go233
index b6ce85b..dad2ddc 100755
--- a/core/sqf/src/seabed/test/go233
+++ b/core/sqf/src/seabed/test/go233
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go234 b/core/sqf/src/seabed/test/go234
index 14feca0..d28c85f 100755
--- a/core/sqf/src/seabed/test/go234
+++ b/core/sqf/src/seabed/test/go234
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go235 b/core/sqf/src/seabed/test/go235
index c7406ce..df34334 100755
--- a/core/sqf/src/seabed/test/go235
+++ b/core/sqf/src/seabed/test/go235
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go238 b/core/sqf/src/seabed/test/go238
index 736db91..13749cb 100755
--- a/core/sqf/src/seabed/test/go238
+++ b/core/sqf/src/seabed/test/go238
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go24 b/core/sqf/src/seabed/test/go24
index 4454184..579943a 100755
--- a/core/sqf/src/seabed/test/go24
+++ b/core/sqf/src/seabed/test/go24
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go240 b/core/sqf/src/seabed/test/go240
index c71b4ae..11d5c70 100755
--- a/core/sqf/src/seabed/test/go240
+++ b/core/sqf/src/seabed/test/go240
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go241 b/core/sqf/src/seabed/test/go241
index 7b0d4d2..48c023d 100755
--- a/core/sqf/src/seabed/test/go241
+++ b/core/sqf/src/seabed/test/go241
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go242 b/core/sqf/src/seabed/test/go242
index 7e68f68..16b3d25 100755
--- a/core/sqf/src/seabed/test/go242
+++ b/core/sqf/src/seabed/test/go242
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go245 b/core/sqf/src/seabed/test/go245
index 7428d75..8a424fb 100755
--- a/core/sqf/src/seabed/test/go245
+++ b/core/sqf/src/seabed/test/go245
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go246 b/core/sqf/src/seabed/test/go246
index a104c8b..ebf104e 100755
--- a/core/sqf/src/seabed/test/go246
+++ b/core/sqf/src/seabed/test/go246
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go247 b/core/sqf/src/seabed/test/go247
index 8a08fc8..00f5276 100755
--- a/core/sqf/src/seabed/test/go247
+++ b/core/sqf/src/seabed/test/go247
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go249 b/core/sqf/src/seabed/test/go249
index 8fef408..f4fbb6e 100755
--- a/core/sqf/src/seabed/test/go249
+++ b/core/sqf/src/seabed/test/go249
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go25 b/core/sqf/src/seabed/test/go25
index 3b35858..a5841e5 100755
--- a/core/sqf/src/seabed/test/go25
+++ b/core/sqf/src/seabed/test/go25
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go250 b/core/sqf/src/seabed/test/go250
index 4e6f879..85f9fe6 100755
--- a/core/sqf/src/seabed/test/go250
+++ b/core/sqf/src/seabed/test/go250
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go253 b/core/sqf/src/seabed/test/go253
index 6dc398b..4e447bc 100755
--- a/core/sqf/src/seabed/test/go253
+++ b/core/sqf/src/seabed/test/go253
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go254 b/core/sqf/src/seabed/test/go254
index c086ae1..50aeced 100755
--- a/core/sqf/src/seabed/test/go254
+++ b/core/sqf/src/seabed/test/go254
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go26 b/core/sqf/src/seabed/test/go26
index c897dec..e7d4d92 100755
--- a/core/sqf/src/seabed/test/go26
+++ b/core/sqf/src/seabed/test/go26
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go261 b/core/sqf/src/seabed/test/go261
index 8ae7cee..ae4e766 100755
--- a/core/sqf/src/seabed/test/go261
+++ b/core/sqf/src/seabed/test/go261
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go262 b/core/sqf/src/seabed/test/go262
index b73d977..1baa284 100755
--- a/core/sqf/src/seabed/test/go262
+++ b/core/sqf/src/seabed/test/go262
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go264 b/core/sqf/src/seabed/test/go264
index bd2f9ca..d4238e3 100755
--- a/core/sqf/src/seabed/test/go264
+++ b/core/sqf/src/seabed/test/go264
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go266 b/core/sqf/src/seabed/test/go266
index 59729a7..c35ceec 100755
--- a/core/sqf/src/seabed/test/go266
+++ b/core/sqf/src/seabed/test/go266
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go27 b/core/sqf/src/seabed/test/go27
index ebc9a8c..b29ac0b 100755
--- a/core/sqf/src/seabed/test/go27
+++ b/core/sqf/src/seabed/test/go27
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go277 b/core/sqf/src/seabed/test/go277
index 997f1b6..54f7948 100755
--- a/core/sqf/src/seabed/test/go277
+++ b/core/sqf/src/seabed/test/go277
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go279 b/core/sqf/src/seabed/test/go279
index 39e34f1..0f027cc 100755
--- a/core/sqf/src/seabed/test/go279
+++ b/core/sqf/src/seabed/test/go279
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go280 b/core/sqf/src/seabed/test/go280
index c114b54..de3794e 100755
--- a/core/sqf/src/seabed/test/go280
+++ b/core/sqf/src/seabed/test/go280
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go281 b/core/sqf/src/seabed/test/go281
index 7ecefe9..2fb68d6 100755
--- a/core/sqf/src/seabed/test/go281
+++ b/core/sqf/src/seabed/test/go281
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go282 b/core/sqf/src/seabed/test/go282
index 61d8eb5..5b003a6 100755
--- a/core/sqf/src/seabed/test/go282
+++ b/core/sqf/src/seabed/test/go282
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go284 b/core/sqf/src/seabed/test/go284
index 286957d..5a22f0b 100755
--- a/core/sqf/src/seabed/test/go284
+++ b/core/sqf/src/seabed/test/go284
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go284j b/core/sqf/src/seabed/test/go284j
index 277d9dc..24ff91b 100755
--- a/core/sqf/src/seabed/test/go284j
+++ b/core/sqf/src/seabed/test/go284j
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go285 b/core/sqf/src/seabed/test/go285
index dd7b433..fc2692f 100755
--- a/core/sqf/src/seabed/test/go285
+++ b/core/sqf/src/seabed/test/go285
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go3 b/core/sqf/src/seabed/test/go3
index 302edc2..b444c07 100755
--- a/core/sqf/src/seabed/test/go3
+++ b/core/sqf/src/seabed/test/go3
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go30 b/core/sqf/src/seabed/test/go30
index f204ac0..1bd2104 100755
--- a/core/sqf/src/seabed/test/go30
+++ b/core/sqf/src/seabed/test/go30
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go31 b/core/sqf/src/seabed/test/go31
index f180690..84142c8 100755
--- a/core/sqf/src/seabed/test/go31
+++ b/core/sqf/src/seabed/test/go31
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go32x b/core/sqf/src/seabed/test/go32x
index ed4c303..ab51868 100755
--- a/core/sqf/src/seabed/test/go32x
+++ b/core/sqf/src/seabed/test/go32x
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go35 b/core/sqf/src/seabed/test/go35
index 222f482..5e96dc7 100755
--- a/core/sqf/src/seabed/test/go35
+++ b/core/sqf/src/seabed/test/go35
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go36 b/core/sqf/src/seabed/test/go36
index cfa4230..a07d06b 100755
--- a/core/sqf/src/seabed/test/go36
+++ b/core/sqf/src/seabed/test/go36
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go37 b/core/sqf/src/seabed/test/go37
index 26a11a2..e90842a 100755
--- a/core/sqf/src/seabed/test/go37
+++ b/core/sqf/src/seabed/test/go37
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go38 b/core/sqf/src/seabed/test/go38
index 8ac8b37..9456909 100755
--- a/core/sqf/src/seabed/test/go38
+++ b/core/sqf/src/seabed/test/go38
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go39 b/core/sqf/src/seabed/test/go39
index c118f58..0e60f70 100755
--- a/core/sqf/src/seabed/test/go39
+++ b/core/sqf/src/seabed/test/go39
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go4 b/core/sqf/src/seabed/test/go4
index 1c179d1..25e938d 100755
--- a/core/sqf/src/seabed/test/go4
+++ b/core/sqf/src/seabed/test/go4
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go40 b/core/sqf/src/seabed/test/go40
index aac9760..6207ed6 100755
--- a/core/sqf/src/seabed/test/go40
+++ b/core/sqf/src/seabed/test/go40
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go41 b/core/sqf/src/seabed/test/go41
index 13cbef8..7a7c0f4 100755
--- a/core/sqf/src/seabed/test/go41
+++ b/core/sqf/src/seabed/test/go41
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go42 b/core/sqf/src/seabed/test/go42
index 993afa1..28ed96c 100755
--- a/core/sqf/src/seabed/test/go42
+++ b/core/sqf/src/seabed/test/go42
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go43 b/core/sqf/src/seabed/test/go43
index 867458a..001b30c 100755
--- a/core/sqf/src/seabed/test/go43
+++ b/core/sqf/src/seabed/test/go43
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go44 b/core/sqf/src/seabed/test/go44
index 551b2a6..e356f82 100755
--- a/core/sqf/src/seabed/test/go44
+++ b/core/sqf/src/seabed/test/go44
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go45 b/core/sqf/src/seabed/test/go45
index 218bf35..c58da6b 100755
--- a/core/sqf/src/seabed/test/go45
+++ b/core/sqf/src/seabed/test/go45
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go48 b/core/sqf/src/seabed/test/go48
index 705be44..626b20c 100755
--- a/core/sqf/src/seabed/test/go48
+++ b/core/sqf/src/seabed/test/go48
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go49 b/core/sqf/src/seabed/test/go49
index 3fb1393..11d8572 100755
--- a/core/sqf/src/seabed/test/go49
+++ b/core/sqf/src/seabed/test/go49
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go5 b/core/sqf/src/seabed/test/go5
index e74cd21..99525ab 100755
--- a/core/sqf/src/seabed/test/go5
+++ b/core/sqf/src/seabed/test/go5
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go50 b/core/sqf/src/seabed/test/go50
index bfad78a..0999733 100755
--- a/core/sqf/src/seabed/test/go50
+++ b/core/sqf/src/seabed/test/go50
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go51 b/core/sqf/src/seabed/test/go51
index 80f2cc4..96d8c8f 100755
--- a/core/sqf/src/seabed/test/go51
+++ b/core/sqf/src/seabed/test/go51
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go52 b/core/sqf/src/seabed/test/go52
index f8285eb..fe5feba 100755
--- a/core/sqf/src/seabed/test/go52
+++ b/core/sqf/src/seabed/test/go52
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go53 b/core/sqf/src/seabed/test/go53
index 25c83ce..e03aa55 100755
--- a/core/sqf/src/seabed/test/go53
+++ b/core/sqf/src/seabed/test/go53
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go54 b/core/sqf/src/seabed/test/go54
index 98eca54..debaee2 100755
--- a/core/sqf/src/seabed/test/go54
+++ b/core/sqf/src/seabed/test/go54
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go55 b/core/sqf/src/seabed/test/go55
index b9bd4a8..03615c4 100755
--- a/core/sqf/src/seabed/test/go55
+++ b/core/sqf/src/seabed/test/go55
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go56 b/core/sqf/src/seabed/test/go56
index 4702de7..b499c6c 100755
--- a/core/sqf/src/seabed/test/go56
+++ b/core/sqf/src/seabed/test/go56
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go57 b/core/sqf/src/seabed/test/go57
index 085adc9..defa97f 100755
--- a/core/sqf/src/seabed/test/go57
+++ b/core/sqf/src/seabed/test/go57
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go58 b/core/sqf/src/seabed/test/go58
index 131e711..88f8ff1 100755
--- a/core/sqf/src/seabed/test/go58
+++ b/core/sqf/src/seabed/test/go58
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go6 b/core/sqf/src/seabed/test/go6
index 28ab99e..86725cc 100755
--- a/core/sqf/src/seabed/test/go6
+++ b/core/sqf/src/seabed/test/go6
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go61 b/core/sqf/src/seabed/test/go61
index 67e2a44..82c8d3c 100755
--- a/core/sqf/src/seabed/test/go61
+++ b/core/sqf/src/seabed/test/go61
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go62 b/core/sqf/src/seabed/test/go62
index 3c617b9..a2a0d66 100755
--- a/core/sqf/src/seabed/test/go62
+++ b/core/sqf/src/seabed/test/go62
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go63 b/core/sqf/src/seabed/test/go63
index 692486e..ea289ca 100755
--- a/core/sqf/src/seabed/test/go63
+++ b/core/sqf/src/seabed/test/go63
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go64 b/core/sqf/src/seabed/test/go64
index 404caa1..a80780e 100755
--- a/core/sqf/src/seabed/test/go64
+++ b/core/sqf/src/seabed/test/go64
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go65 b/core/sqf/src/seabed/test/go65
index 40cb1b8..2f97fb4 100755
--- a/core/sqf/src/seabed/test/go65
+++ b/core/sqf/src/seabed/test/go65
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go66 b/core/sqf/src/seabed/test/go66
index 0b3b442..68c5334 100755
--- a/core/sqf/src/seabed/test/go66
+++ b/core/sqf/src/seabed/test/go66
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go67 b/core/sqf/src/seabed/test/go67
index 5cd7a42..f5ffea9 100755
--- a/core/sqf/src/seabed/test/go67
+++ b/core/sqf/src/seabed/test/go67
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go68 b/core/sqf/src/seabed/test/go68
index 26da072..168f915 100755
--- a/core/sqf/src/seabed/test/go68
+++ b/core/sqf/src/seabed/test/go68
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go69 b/core/sqf/src/seabed/test/go69
index ede3597..ef3b499 100755
--- a/core/sqf/src/seabed/test/go69
+++ b/core/sqf/src/seabed/test/go69
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go69mon b/core/sqf/src/seabed/test/go69mon
index fa94d9c..d0c43ca 100755
--- a/core/sqf/src/seabed/test/go69mon
+++ b/core/sqf/src/seabed/test/go69mon
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go70 b/core/sqf/src/seabed/test/go70
index fd1798c..6527f2b 100755
--- a/core/sqf/src/seabed/test/go70
+++ b/core/sqf/src/seabed/test/go70
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go71 b/core/sqf/src/seabed/test/go71
index edda55c..1af16b1 100755
--- a/core/sqf/src/seabed/test/go71
+++ b/core/sqf/src/seabed/test/go71
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go72 b/core/sqf/src/seabed/test/go72
index e52f9a1..2977caf 100755
--- a/core/sqf/src/seabed/test/go72
+++ b/core/sqf/src/seabed/test/go72
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go73 b/core/sqf/src/seabed/test/go73
index a66cb96..10c7441 100755
--- a/core/sqf/src/seabed/test/go73
+++ b/core/sqf/src/seabed/test/go73
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go74 b/core/sqf/src/seabed/test/go74
index a5ca320..76ba32e 100755
--- a/core/sqf/src/seabed/test/go74
+++ b/core/sqf/src/seabed/test/go74
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go75 b/core/sqf/src/seabed/test/go75
index 153a9f4..620df54 100755
--- a/core/sqf/src/seabed/test/go75
+++ b/core/sqf/src/seabed/test/go75
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go76 b/core/sqf/src/seabed/test/go76
index ec86d02..4ec2d1b 100755
--- a/core/sqf/src/seabed/test/go76
+++ b/core/sqf/src/seabed/test/go76
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go77 b/core/sqf/src/seabed/test/go77
index 490449e..ce55dac 100755
--- a/core/sqf/src/seabed/test/go77
+++ b/core/sqf/src/seabed/test/go77
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go78 b/core/sqf/src/seabed/test/go78
index 04be508..580a619 100755
--- a/core/sqf/src/seabed/test/go78
+++ b/core/sqf/src/seabed/test/go78
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go79 b/core/sqf/src/seabed/test/go79
index 7be3ee7..dcb02f5 100755
--- a/core/sqf/src/seabed/test/go79
+++ b/core/sqf/src/seabed/test/go79
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go80 b/core/sqf/src/seabed/test/go80
index d24076d..142b769 100755
--- a/core/sqf/src/seabed/test/go80
+++ b/core/sqf/src/seabed/test/go80
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go83 b/core/sqf/src/seabed/test/go83
index d7258d5..900757b 100755
--- a/core/sqf/src/seabed/test/go83
+++ b/core/sqf/src/seabed/test/go83
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go84 b/core/sqf/src/seabed/test/go84
index bbf3837..fa96248 100755
--- a/core/sqf/src/seabed/test/go84
+++ b/core/sqf/src/seabed/test/go84
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go85 b/core/sqf/src/seabed/test/go85
index 9c2d6ff..01352d6 100755
--- a/core/sqf/src/seabed/test/go85
+++ b/core/sqf/src/seabed/test/go85
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go86 b/core/sqf/src/seabed/test/go86
index 413c8f3..7320286 100755
--- a/core/sqf/src/seabed/test/go86
+++ b/core/sqf/src/seabed/test/go86
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go88 b/core/sqf/src/seabed/test/go88
index 3cd8eca..4f3af60 100755
--- a/core/sqf/src/seabed/test/go88
+++ b/core/sqf/src/seabed/test/go88
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go9 b/core/sqf/src/seabed/test/go9
index 6f37f8c..6fcee08 100755
--- a/core/sqf/src/seabed/test/go9
+++ b/core/sqf/src/seabed/test/go9
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go91 b/core/sqf/src/seabed/test/go91
index 5ca1cae..40b1896 100755
--- a/core/sqf/src/seabed/test/go91
+++ b/core/sqf/src/seabed/test/go91
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go92 b/core/sqf/src/seabed/test/go92
index 7da39a9..3afc2af 100755
--- a/core/sqf/src/seabed/test/go92
+++ b/core/sqf/src/seabed/test/go92
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go93 b/core/sqf/src/seabed/test/go93
index da8389a..68ada92 100755
--- a/core/sqf/src/seabed/test/go93
+++ b/core/sqf/src/seabed/test/go93
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go94 b/core/sqf/src/seabed/test/go94
index 7256d1f..9a6e5e0 100755
--- a/core/sqf/src/seabed/test/go94
+++ b/core/sqf/src/seabed/test/go94
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go95 b/core/sqf/src/seabed/test/go95
index 2522b47..b252991 100755
--- a/core/sqf/src/seabed/test/go95
+++ b/core/sqf/src/seabed/test/go95
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go96 b/core/sqf/src/seabed/test/go96
index bae23d7..a9fcffb 100755
--- a/core/sqf/src/seabed/test/go96
+++ b/core/sqf/src/seabed/test/go96
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/go98 b/core/sqf/src/seabed/test/go98
index 6ebedd6..b3c44ba 100755
--- a/core/sqf/src/seabed/test/go98
+++ b/core/sqf/src/seabed/test/go98
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/goall b/core/sqf/src/seabed/test/goall
index 0c213e7..81528ea 100755
--- a/core/sqf/src/seabed/test/goall
+++ b/core/sqf/src/seabed/test/goall
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
@@ -22,6 +22,7 @@
 
 export TEST_STOP=test.status
 export SQ_MON_ENV_CHECK_DISABLE=1
+#export MON_DTM_PRIMITIVE_DISABLE=1
 cluster=
 if [ "$1" = "-cluster" ]; then
 	cluster=-cluster
@@ -84,7 +85,7 @@
 go49 $cluster $verbose
 go50 $cluster $verbose
 go50 $cluster $verbose -nowait
-go51 $cluster $verbose
+####go51 $cluster $verbose
 go52 $cluster $verbose
 ####go53 $cluster $verbose
 ####go53 $cluster $verbose -nowait
@@ -139,7 +140,7 @@
 go102 $cluster $verbose
 go103 $cluster $verbose
 go104 $cluster $verbose
-go105 $cluster $verbose
+####go105 $cluster $verbose
 go115 $cluster $verbose
 go118 $cluster $verbose
 go118 $cluster $verbose -nowait
@@ -254,13 +255,13 @@
 go241 $cluster $verbose
 go242 $cluster $verbose
 go245 $cluster $verbose
-go246 $cluster $verbose
+####go246 $cluster $verbose
 ####go249 $cluster $verbose
 ####go250 $cluster $verbose
 go253 $cluster $verbose
-go261 $cluster $verbose
+####go261 $cluster $verbose
 go262 $cluster $verbose
-go266 $cluster $verbose
+####go266 $cluster $verbose
 go277 $cluster $verbose
 go277 $cluster $verbose -thread
 go277 $cluster $verbose -thread -any1
diff --git a/core/sqf/src/seabed/test/gocleandb b/core/sqf/src/seabed/test/gocleandb
index a53e65e..c968744 100755
--- a/core/sqf/src/seabed/test/gocleandb
+++ b/core/sqf/src/seabed/test/gocleandb
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 
 # @@@ START COPYRIGHT @@@
 #
diff --git a/core/sqf/src/seabed/test/gocleanport b/core/sqf/src/seabed/test/gocleanport
index be97a4c..efc10af 100755
--- a/core/sqf/src/seabed/test/gocleanport
+++ b/core/sqf/src/seabed/test/gocleanport
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/godb b/core/sqf/src/seabed/test/godb
index 33ea172..ae0cd1b 100755
--- a/core/sqf/src/seabed/test/godb
+++ b/core/sqf/src/seabed/test/godb
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/goend b/core/sqf/src/seabed/test/goend
index b45ce3f..bc96a2f 100755
--- a/core/sqf/src/seabed/test/goend
+++ b/core/sqf/src/seabed/test/goend
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/gofilter b/core/sqf/src/seabed/test/gofilter
index 8c807c6..ba274aa 100755
--- a/core/sqf/src/seabed/test/gofilter
+++ b/core/sqf/src/seabed/test/gofilter
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/gohost b/core/sqf/src/seabed/test/gohost
index 682b76f..dad2078 100755
--- a/core/sqf/src/seabed/test/gohost
+++ b/core/sqf/src/seabed/test/gohost
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/gokill b/core/sqf/src/seabed/test/gokill
index 4324946..ee6e86f 100755
--- a/core/sqf/src/seabed/test/gokill
+++ b/core/sqf/src/seabed/test/gokill
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/goloop b/core/sqf/src/seabed/test/goloop
index 516917c..c071471 100755
--- a/core/sqf/src/seabed/test/goloop
+++ b/core/sqf/src/seabed/test/goloop
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/gosetup b/core/sqf/src/seabed/test/gosetup
index 619606b..5d11f87 100755
--- a/core/sqf/src/seabed/test/gosetup
+++ b/core/sqf/src/seabed/test/gosetup
@@ -19,6 +19,8 @@
 #
 # @@@ END COPYRIGHT @@@
 
+export MON_DTM_PRIMITIVE_DISABLE=1
+
 cluster=
 if [ "$1" = "-cluster" ]; then
 	cluster=-cluster
diff --git a/core/sqf/src/seabed/test/gosetupdown b/core/sqf/src/seabed/test/gosetupdown
index bacf963..16bf73b 100755
--- a/core/sqf/src/seabed/test/gosetupdown
+++ b/core/sqf/src/seabed/test/gosetupdown
@@ -21,3 +21,4 @@
 
 # setup for node-down
 export SQ_IC=TCP
+export MON_DTM_PRIMITIVE_DISABLE=1
diff --git a/core/sqf/src/seabed/test/goshell b/core/sqf/src/seabed/test/goshell
index d11b71b..c5465c4 100755
--- a/core/sqf/src/seabed/test/goshell
+++ b/core/sqf/src/seabed/test/goshell
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/goshell100 b/core/sqf/src/seabed/test/goshell100
index 76e9007..6b6441c 100755
--- a/core/sqf/src/seabed/test/goshell100
+++ b/core/sqf/src/seabed/test/goshell100
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/goshell102 b/core/sqf/src/seabed/test/goshell102
index 40f4741..235b3c2 100755
--- a/core/sqf/src/seabed/test/goshell102
+++ b/core/sqf/src/seabed/test/goshell102
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/goshell124 b/core/sqf/src/seabed/test/goshell124
index d11f474..dcbd079 100755
--- a/core/sqf/src/seabed/test/goshell124
+++ b/core/sqf/src/seabed/test/goshell124
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/goshell149 b/core/sqf/src/seabed/test/goshell149
index 5961192..5ed318a 100755
--- a/core/sqf/src/seabed/test/goshell149
+++ b/core/sqf/src/seabed/test/goshell149
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/goshell153 b/core/sqf/src/seabed/test/goshell153
index 78f4c37..5243968 100755
--- a/core/sqf/src/seabed/test/goshell153
+++ b/core/sqf/src/seabed/test/goshell153
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/goshell166 b/core/sqf/src/seabed/test/goshell166
index d59af26..727aacb 100755
--- a/core/sqf/src/seabed/test/goshell166
+++ b/core/sqf/src/seabed/test/goshell166
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/goshell176 b/core/sqf/src/seabed/test/goshell176
index d2f97d5..1fdf5d1 100755
--- a/core/sqf/src/seabed/test/goshell176
+++ b/core/sqf/src/seabed/test/goshell176
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/goshell18 b/core/sqf/src/seabed/test/goshell18
index e5786d0..d4ec61e 100755
--- a/core/sqf/src/seabed/test/goshell18
+++ b/core/sqf/src/seabed/test/goshell18
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/goshell205 b/core/sqf/src/seabed/test/goshell205
index 76d0cdc..489c719 100755
--- a/core/sqf/src/seabed/test/goshell205
+++ b/core/sqf/src/seabed/test/goshell205
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/goshell261 b/core/sqf/src/seabed/test/goshell261
index 7f693e0..4b23027 100755
--- a/core/sqf/src/seabed/test/goshell261
+++ b/core/sqf/src/seabed/test/goshell261
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/goshell284 b/core/sqf/src/seabed/test/goshell284
index ee8abbe..445262d 100755
--- a/core/sqf/src/seabed/test/goshell284
+++ b/core/sqf/src/seabed/test/goshell284
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/goshell31 b/core/sqf/src/seabed/test/goshell31
index e550dfa..a6e20e1 100755
--- a/core/sqf/src/seabed/test/goshell31
+++ b/core/sqf/src/seabed/test/goshell31
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/goshell31d b/core/sqf/src/seabed/test/goshell31d
index 1d79d1e..2e4e16b 100755
--- a/core/sqf/src/seabed/test/goshell31d
+++ b/core/sqf/src/seabed/test/goshell31d
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/goshell31vc b/core/sqf/src/seabed/test/goshell31vc
index 2d00f31..b859658 100755
--- a/core/sqf/src/seabed/test/goshell31vc
+++ b/core/sqf/src/seabed/test/goshell31vc
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/goshell3c b/core/sqf/src/seabed/test/goshell3c
index 71c553d..75c61b0 100755
--- a/core/sqf/src/seabed/test/goshell3c
+++ b/core/sqf/src/seabed/test/goshell3c
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/goshell3vc b/core/sqf/src/seabed/test/goshell3vc
index b2d3970..ab31c2e 100755
--- a/core/sqf/src/seabed/test/goshell3vc
+++ b/core/sqf/src/seabed/test/goshell3vc
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/goshell49 b/core/sqf/src/seabed/test/goshell49
index 084872d..109375e 100755
--- a/core/sqf/src/seabed/test/goshell49
+++ b/core/sqf/src/seabed/test/goshell49
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/goshell51 b/core/sqf/src/seabed/test/goshell51
index 64904c7..24bba22 100755
--- a/core/sqf/src/seabed/test/goshell51
+++ b/core/sqf/src/seabed/test/goshell51
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/goshell64 b/core/sqf/src/seabed/test/goshell64
index d8fdfc4..72aefa8 100755
--- a/core/sqf/src/seabed/test/goshell64
+++ b/core/sqf/src/seabed/test/goshell64
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/goshell76 b/core/sqf/src/seabed/test/goshell76
index f02b1c0..e3a4e76 100755
--- a/core/sqf/src/seabed/test/goshell76
+++ b/core/sqf/src/seabed/test/goshell76
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/goshell81 b/core/sqf/src/seabed/test/goshell81
index a93c87c..b6d228a 100755
--- a/core/sqf/src/seabed/test/goshell81
+++ b/core/sqf/src/seabed/test/goshell81
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/goshell84 b/core/sqf/src/seabed/test/goshell84
index c70cb2d..aa8d57a 100755
--- a/core/sqf/src/seabed/test/goshell84
+++ b/core/sqf/src/seabed/test/goshell84
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/goshell93 b/core/sqf/src/seabed/test/goshell93
index 4848784..8cf2a19 100755
--- a/core/sqf/src/seabed/test/goshell93
+++ b/core/sqf/src/seabed/test/goshell93
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/goshell96 b/core/sqf/src/seabed/test/goshell96
index d2916f7..4dd7d0e 100755
--- a/core/sqf/src/seabed/test/goshell96
+++ b/core/sqf/src/seabed/test/goshell96
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/goshellafilter b/core/sqf/src/seabed/test/goshellafilter
index 6925073..5be29c9 100755
--- a/core/sqf/src/seabed/test/goshellafilter
+++ b/core/sqf/src/seabed/test/goshellafilter
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/goshellcg b/core/sqf/src/seabed/test/goshellcg
index 9c1bca9..dce9d96 100755
--- a/core/sqf/src/seabed/test/goshellcg
+++ b/core/sqf/src/seabed/test/goshellcg
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/goshellcp b/core/sqf/src/seabed/test/goshellcp
index 3ee8092..43c62b6 100755
--- a/core/sqf/src/seabed/test/goshellcp
+++ b/core/sqf/src/seabed/test/goshellcp
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/goshelleg b/core/sqf/src/seabed/test/goshelleg
index 563f110..59737c6 100755
--- a/core/sqf/src/seabed/test/goshelleg
+++ b/core/sqf/src/seabed/test/goshelleg
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/goshelleg1 b/core/sqf/src/seabed/test/goshelleg1
index 7b3797c..6dc3cc8 100755
--- a/core/sqf/src/seabed/test/goshelleg1
+++ b/core/sqf/src/seabed/test/goshelleg1
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/goshellfilter b/core/sqf/src/seabed/test/goshellfilter
index 52eca06..fd78443 100755
--- a/core/sqf/src/seabed/test/goshellfilter
+++ b/core/sqf/src/seabed/test/goshellfilter
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/goshellm b/core/sqf/src/seabed/test/goshellm
index df381bb..35c01e3 100755
--- a/core/sqf/src/seabed/test/goshellm
+++ b/core/sqf/src/seabed/test/goshellm
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/goshelln b/core/sqf/src/seabed/test/goshelln
index 025c84e..767c6b4 100755
--- a/core/sqf/src/seabed/test/goshelln
+++ b/core/sqf/src/seabed/test/goshelln
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/goshellnp b/core/sqf/src/seabed/test/goshellnp
index 039590a..51143b2 100755
--- a/core/sqf/src/seabed/test/goshellnp
+++ b/core/sqf/src/seabed/test/goshellnp
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/goshellsetup b/core/sqf/src/seabed/test/goshellsetup
index 8b8ca17..d8e5054 100755
--- a/core/sqf/src/seabed/test/goshellsetup
+++ b/core/sqf/src/seabed/test/goshellsetup
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/gostart b/core/sqf/src/seabed/test/gostart
index 1c025b5..e415751 100755
--- a/core/sqf/src/seabed/test/gostart
+++ b/core/sqf/src/seabed/test/gostart
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/lunmgr b/core/sqf/src/seabed/test/lunmgr
index 8cdd2b6..35428d3 100755
--- a/core/sqf/src/seabed/test/lunmgr
+++ b/core/sqf/src/seabed/test/lunmgr
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/ms.env b/core/sqf/src/seabed/test/ms.env
index 167dab5..b069be5 100644
--- a/core/sqf/src/seabed/test/ms.env
+++ b/core/sqf/src/seabed/test/ms.env
@@ -26,7 +26,7 @@
 #MS_ASSERT_CHK_SEND=1
 #MS_SONAR=0
 # - trace flags
-MS_TRACE_ENABLE=0
+#MS_TRACE_ENABLE=1
 #MS_TRACE=1
 #MS_TRACE_ALLOC=1
 #MS_TRACE_DATA=1
diff --git a/core/sqf/src/seabed/test/t187ms.cpp b/core/sqf/src/seabed/test/t187ms.cpp
index 04f8a1c..6bf77ae 100644
--- a/core/sqf/src/seabed/test/t187ms.cpp
+++ b/core/sqf/src/seabed/test/t187ms.cpp
@@ -139,9 +139,15 @@
                               BUFSIZ);        // bytecount
         util_check("XMSG_READDATA_", ferr);
         msg = (MS_Mon_Msg *) recv_buffer;
+#if 0
         assert(msg->type == MS_MsgType_ProcessDeath);
         printf("server name=%s received death message for %s\n",
                my_name, msg->u.death.process_name);
+#else
+        assert(msg->type == MS_MsgType_Shutdown);
+        printf("server name=%s received shutdown message\n",
+               my_name);
+#endif
         XMSG_REPLY_(sre.sre_msgId,  // msgid
                     NULL,           // replyctrl
                     0,              // replyctrlsize
diff --git a/core/sqf/src/seabed/test/t209ms.cpp b/core/sqf/src/seabed/test/t209ms.cpp
index d3bc7c2..0944d8e 100644
--- a/core/sqf/src/seabed/test/t209ms.cpp
+++ b/core/sqf/src/seabed/test/t209ms.cpp
@@ -122,10 +122,22 @@
                                                  core_file);
             TEST_CHK_FEOK(ferr);
             printf("core-file=%s\n", core_file);
-            err = stat(core_file, &statbuf);
+            char *pch;
+            pch= strtok (core_file,":");
+            pch = strtok (NULL,":");
+            printf("pch=%s\n", pch);
+            err = stat(pch, &statbuf);
+            int error = errno;
+            printf("stat() failed! - err=%d errno=%d (%s)\n"
+                  , err
+                  , error
+                  , strerror(error) );
             assert(err == 0);
             if (!save)
-                unlink(core_file);
+            {
+                printf("Removing core-file=%s\n", core_file);
+                unlink(pch);
+            }
             if ((loop > 1) && sleepv)
                 sleep(1);
         }
diff --git a/core/sqf/src/seabed/test/t238cmd b/core/sqf/src/seabed/test/t238cmd
index 2dd67c9..455af47 100755
--- a/core/sqf/src/seabed/test/t238cmd
+++ b/core/sqf/src/seabed/test/t238cmd
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seabed/test/t279ms.cpp b/core/sqf/src/seabed/test/t279ms.cpp
index 7dc9349..a835d01 100644
--- a/core/sqf/src/seabed/test/t279ms.cpp
+++ b/core/sqf/src/seabed/test/t279ms.cpp
@@ -545,10 +545,14 @@
             }
 
             printf("core-file=%s\n", core_file);
-            err = stat(core_file, &statbuf);
+            char *pch;
+            pch= strtok (core_file,":");
+            pch = strtok (NULL,":");
+            printf("pch=%s\n", pch);
+            err = stat(pch, &statbuf);
             assert(err == 0);
             if (!save)
-                unlink(core_file);
+                unlink(pch);
             if ((loop > 1) && sleepv)
                 sleep(1);
         }
diff --git a/core/sqf/src/seabed/test/t287ms.cpp b/core/sqf/src/seabed/test/t287ms.cpp
new file mode 100644
index 0000000..dd4423f
--- /dev/null
+++ b/core/sqf/src/seabed/test/t287ms.cpp
@@ -0,0 +1,60 @@
+//------------------------------------------------------------------
+//
+// @@@ START COPYRIGHT @@@
+//
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+//
+// @@@ END COPYRIGHT @@@
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "seabed/fserr.h"
+#include "seabed/ms.h"
+#include "seabed/pctl.h"
+#include "seabed/pevents.h"
+
+#include "tchkfe.h"
+#include "tms.h"
+#include "tmsfsutil.h"
+#include "tutil.h"
+#include "tutilp.h"
+
+// t287ms: start up as a seabed process, print the cluster and instance IDs, shut down.
+int main(int argc, char *argv[]) {
+    int        ferr;      // result of the most recent seabed call
+    int        cid = -1;  // cluster ID, initialised to -1 before the query
+    int        iid = -1;  // instance ID, initialised to -1 before the query
+
+    ferr = msg_init(&argc, &argv);
+    TEST_CHK_FEOK(ferr);  // presumably fails the test unless ferr is FEOK - confirm in tchkfe.h
+    ferr = msg_mon_process_startup(false);  // system messages?
+    TEST_CHK_FEOK(ferr);
+    // Ask for both IDs in one call; they are printed only after the check passes.
+    ferr = msg_mon_get_instance_id(&cid,&iid);
+    TEST_CHK_FEOK(ferr);
+    printf("Cluster ID:  %d\n", cid);
+    printf("Instance ID: %d\n", iid);
+
+    ferr = msg_mon_process_shutdown();
+    TEST_CHK_FEOK(ferr);
+
+    return 0;
+}
diff --git a/core/sqf/src/seabed/test/xcoredel b/core/sqf/src/seabed/test/xcoredel
index c1d6010..b10a91e 100755
--- a/core/sqf/src/seabed/test/xcoredel
+++ b/core/sqf/src/seabed/test/xcoredel
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
@@ -23,7 +23,7 @@
 cores=`ls core.*`
 
 for core in $cores; do
-	file $core|egrep "('/bin/sh |'go|'mpirun |'sh |'shell |'tcheck |'monitor'|'pstartd |'tee |'watchdog')" > /dev/null
+	file $core|egrep "('/bin/bash |'go|'mpirun |'sh |'shell |'tcheck |'monitor'|'pstartd |'tee |'watchdog')" > /dev/null
 	if [ $? = 0 ]; then
 		file $core
 		rm -f $core
diff --git a/core/sqf/src/seabed/test/xkm b/core/sqf/src/seabed/test/xkm
index f01e77f..86a67ee 100755
--- a/core/sqf/src/seabed/test/xkm
+++ b/core/sqf/src/seabed/test/xkm
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
diff --git a/core/sqf/src/seatrans/tm/hbasetmlib2/idtm.cpp b/core/sqf/src/seatrans/tm/hbasetmlib2/idtm.cpp
index 924b1cb..c8d681d 100644
--- a/core/sqf/src/seatrans/tm/hbasetmlib2/idtm.cpp
+++ b/core/sqf/src/seatrans/tm/hbasetmlib2/idtm.cpp
@@ -35,6 +35,7 @@
 #include "seabed/ms.h"
 #include "seabed/pctl.h"
 #include "seabed/pevents.h"
+#include "seabed/thread.h"
 
 #include "idtmsrv.h"
 
diff --git a/core/sqf/src/seatrans/tm/hbasetmlib2/idtmclicom.h b/core/sqf/src/seatrans/tm/hbasetmlib2/idtmclicom.h
index 53da50d..94e2ebb 100644
--- a/core/sqf/src/seatrans/tm/hbasetmlib2/idtmclicom.h
+++ b/core/sqf/src/seatrans/tm/hbasetmlib2/idtmclicom.h
@@ -419,10 +419,40 @@
     int lv_ferr;
     int lv_tmpcount;
 
-    lv_ferr = msg_mon_get_process_info_type(MS_ProcessType_TMID,
-                                            pp_count,
-                                            0,      // max
-                                            NULL);  // info
+    // Added retries to get the 'idtm' process info. 
+    // Reason: The DTM process is now a primitive process and
+    // the monitor starts it up right at the outset. At that point 
+    // the 'idtm' may not be up. The TM (java) code wants to get the
+    // ID from the idtm server during its initialisation.
+
+    int lv_Retries = 0;
+    int lc_maxRetries = 200;
+    int lc_Pause = 3000; // 3 seconds
+
+    do {
+
+      if (lv_Retries > 0) {
+        if (gv_verbose) {
+            printf("cli: do_get_servers, retry#%d, going to sleep for %d ms.\n", lv_Retries, lc_Pause);
+        }
+        SB_Thread::Sthr::sleep(lc_Pause); // in msec
+      }
+
+      lv_ferr = msg_mon_get_process_info_type(MS_ProcessType_TMID,
+                                              pp_count,
+                                              0,      // max
+                                              NULL);  // info
+      if (gv_verbose) {
+        printf("cli: do_get_servers process type TMID err=%d, num_servers=%d\n", lv_ferr, *pp_count);
+      }
+
+      lv_Retries++;
+    }
+    while ((lv_Retries <= lc_maxRetries) && 
+           ((lv_ferr != XZFIL_ERR_OK) ||
+            (*pp_count <= 0)))
+      ;
+
     if (lv_ferr == XZFIL_ERR_OK) {
         *ppp_pi = new MS_Mon_Process_Info_Type[*pp_count];
         lv_ferr = msg_mon_get_process_info_type(MS_ProcessType_TMID,
diff --git a/core/sqf/src/tm/Makefile b/core/sqf/src/tm/Makefile
index f0adc6e..18af54b 100644
--- a/core/sqf/src/tm/Makefile
+++ b/core/sqf/src/tm/Makefile
@@ -93,7 +93,6 @@
 		      $(OUTDIR)/tmmap.o \
 		      $(OUTDIR)/tmmutex.o \
 		      $(OUTDIR)/tmdeque.o \
-		      $(OUTDIR)/tmsync.o \
 		      $(OUTDIR)/tmtxkey.o \
 		      $(OUTDIR)/tmrecovstate.o \
 		      $(OUTDIR)/tmpoolelement.o \
@@ -126,6 +125,13 @@
 		      $(OUTDIR)/tmxarmmain.o \
 		      $(OUTDIR)/tm.o \
 		      $(OUTDIR)/CommonLogger.o  
+
+ifeq ($(SUPPORT_TM_SYNC),1)
+DEFINES         += -DSUPPORT_TM_SYNC
+TMOBJS          += $(OUTDIR)/tmsync.o
+endif
+
+
 IDSRVOBJS	= $(OUTDIR)/idtmsrv.o \
 		      $(OUTDIR)/versidtmsrv.o
 ALLOBJS	= $(LIBSTMOBJS) \
diff --git a/core/sqf/src/tm/tm.cpp b/core/sqf/src/tm/tm.cpp
index e812cab..c35ca0d 100644
--- a/core/sqf/src/tm/tm.cpp
+++ b/core/sqf/src/tm/tm.cpp
@@ -1837,6 +1837,7 @@
 
 } //tm_get_leader_info
 
+#ifdef SUPPORT_TM_SYNC
 //---------------------------------------------------------------------
 // tm_originating_sync_commit
 // Purpose - helper method to process the phase2 sync from the 
@@ -2099,7 +2100,7 @@
      }
      TMTrace(2, ("tm_originating_sync_abort EXIT\n"));
 }
-
+#endif
 
 // ---------------------------------------------------------------------------
 // tm_process_node_down_msg
@@ -2552,8 +2553,12 @@
         tm_log_event(DTM_NODEUP, SQ_LOG_INFO, "DTM_NODEUP", 
             -1,-1,gv_tm_info.nid(),-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
            NULL,lv_msg.u.up.nid);
+        if (gv_tm_info.lead_tm()) {
+          gv_tm_info.open_restarted_tm(lv_msg.u.up.nid);
+        }
         break;
     }
+#if 0
     case MS_MsgType_NodePrepare:
     {
         TMTrace(1, ("tm_process_monitor_msg NodePrepare notice for nid %d\n", lv_msg.u.prepare.nid));
@@ -2586,6 +2591,7 @@
         }
         break;
     }
+#endif
     case MS_MsgType_ProcessDeath:
     {
         TMTrace(3, ("tm_process_monitor_msg Process Death notice for %s\n", 
@@ -2723,6 +2729,7 @@
                      NULL);          /*newphandle*/
          break;
     }
+#ifdef SUPPORT_TM_SYNC
     case MS_MsgType_TmSyncAbort:
     {
         // There can be many monitor replies, so circle through them all
@@ -2770,8 +2777,9 @@
         }
     break;
     }
+#endif
     case MS_MsgType_Event:
-    case MS_MsgType_UnsolicitedMessage:
+//    case MS_MsgType_UnsolicitedMessage:
     default:
     {
          break;
@@ -2803,6 +2811,8 @@
     MESSAGE_HEADER_SQ     *lp_msg_hdr;
     CTmTxMessage          *lp_msg;
 
+    static bool           sv_schedule_init_and_recover_rms_called = false;
+
     TMTrace(2, ("tm_process_msg ENTRY\n"));
 
     if((unsigned)(pp_sre->sre_reqDataSize) > (sizeof(Tm_Req_Msg_Type))){
@@ -2987,14 +2997,17 @@
           }
           else
           {
-             if (lp_cp_req->iv_startup)
+            if (! sv_schedule_init_and_recover_rms_called)
              {
-                 TMTrace(3, ("tm_process_msg : Control Point startup from Lead TM nid %d.\n",
+                 TMTrace(3, ("tm_process_msg : Control Point request (first) from the Lead TM nid %d.\n",
                          lp_cp_req->iv_sending_tm_nid));
                  gv_tm_info.schedule_init_and_recover_rms();
+                 sv_schedule_init_and_recover_rms_called = true;
              }
              else
              {
+                 TMTrace(3, ("tm_process_msg : Control Point request (subsequent) from the Lead TM nid %d.\n",
+                         lp_cp_req->iv_sending_tm_nid));
                  gv_system_tx_count = 0;
                  gv_tm_info.write_all_trans_state();
              }
@@ -3225,7 +3238,7 @@
 // ----------------------------------------------------------------
 void tm_shutdown_helper ()
 {
-    TMTrace(2, ("tm_shutdown_helper ENTRY\n"));
+    TMTrace(2, ("tm_shutdown_helper ENTRY, num of active transactions:%d\n",gv_tm_info.num_active_txs()));
 
     if (gv_tm_info.num_active_txs() <= 0)
     {
@@ -3245,6 +3258,31 @@
    TMTrace(2, ("tm_shutdown_helper EXIT\n"));
 }
 
+//
+// generate_tm_ready_if_necessary
+// Purpose - a non-lead DTM sends the 'TM Ready' message to the
+// monitor when it is started. The monitor generates the 'Node UP'
+// message when it receives 'TM Ready', and 'Node UP' causes the
+// lead DTM to (re)connect with this DTM.
+// Returns false, sending nothing, when this DTM is the lead;
+// otherwise calls msg_mon_tm_ready() and returns true (the result
+// of that call is not checked).
+//
+bool generate_tm_ready_if_necessary()
+{
+  // Lead DTM: nothing to announce.
+  if (gv_tm_info.lead_tm()) {
+    TMTrace(2, ("generate_tm_ready_if_necessary, returning as I am the lead().\n"));
+    return false;
+  }
+  // Non-lead DTM: notify the monitor, tracing before and after the call.
+  TMTrace(2, ("generate_tm_ready_if_necessary - going to call:msg_mon_tm_ready\n"));
+  msg_mon_tm_ready();
+  TMTrace(2, ("generate_tm_ready_if_necessary - back from call:msg_mon_tm_ready\n"));
+ 
+  return true;
+ 
+}
 // ---------------------------------------------------------------
 // tm_main_initialize
 // Purpose - call all initialization routines
@@ -3268,11 +3306,13 @@
                       &lv_leader_pid, la_leader_name);
     gv_tm_info.lead_tm_nid(lv_leader_nid);
 
+    TMTrace(1, ("tm_main_initialize - lead dtm node id:%d\n", lv_leader_nid)); 
     if (lv_leader_nid < 0  || lv_leader_nid >= MAX_NODES)
     {
         tm_log_event(DTM_TM_LEADTM_BAD, SQ_LOG_CRIT, "DTM_TM_LEADTM_BAD",
                      -1, -1, gv_tm_info.nid(), -1, -1, -1, -1, -1, -1, -1, 
                      -1, -1, -1, -1, -1, -1, NULL, lv_leader_nid);
+        TMTrace(1, ("tm_main_initialize - bad lead dtm node id:%d\n", lv_leader_nid)); 
         abort();
     }
     if (lv_leader_nid == gv_tm_info.nid())
@@ -3303,7 +3343,11 @@
         //The AM and TSE events will be implemented in the future.
         //msg_mon_event_wait (AM_TLOG_FIXUP_COMPLETED_EVENT_ID, &lv_event_len, la_event_data);
         //msg_mon_event_wait (TSE_START_EVENT_ID, &lv_event_len, la_event_data);
-        msg_mon_event_wait (DTM_START_EVENT_ID, &lv_event_len, la_event_data);
+        if (gv_tm_info.incarnation_num() == 0) {
+          TMTrace(2, ("tm_main_initialize - waiting for the start event\n")); 
+          msg_mon_event_wait (DTM_START_EVENT_ID, &lv_event_len, la_event_data);
+          TMTrace(2, ("tm_main_initialize - got the start event\n")); 
+        }
     }
 
      //Start timer thread
@@ -3390,7 +3434,7 @@
     msg_mon_process_startup(true); // server?
     msg_debug_hook ("tm.hook", "tm.hook");
     tm_init_logging();
-    msg_mon_tmsync_register(tm_sync_cb);
+//    msg_mon_tmsync_register(tm_sync_cb);
     msg_mon_enable_mon_messages (1);
     msg_enable_priority_queue();
     // allow the DTM to use all the message descriptors
@@ -3398,6 +3442,18 @@
     XCONTROLMESSAGESYSTEM(XCTLMSGSYS_SETSENDLIMIT,SEABED_MAX_SETTABLE_SENDLIMIT_TM);
     tm_main_initialize();
 
+    TMTrace(1, ("thread_main - back from tm_main_initialize.\n"));
+
+    // Added this block when removing the TM sync mechanism
+    gv_tm_info.set_txnsvc_ready(TXNSVC_UP);
+    gv_tm_info.can_takeover(true);
+
+    bool lv_tm_ready_generated = generate_tm_ready_if_necessary();
+
+    if (! gv_tm_info.lead_tm() && !lv_tm_ready_generated) {
+      gv_tm_info.schedule_init_and_recover_rms();
+    }
+
     for(;;) 
     {
         int lv_msg_count = 0;
diff --git a/core/sqf/src/tm/tminfo.cpp b/core/sqf/src/tm/tminfo.cpp
index 6947a49..c6456b5 100644
--- a/core/sqf/src/tm/tminfo.cpp
+++ b/core/sqf/src/tm/tminfo.cpp
@@ -1297,7 +1297,7 @@
 
     if (!all_rms_closed()) {
        // RMs open, no need to re-open
-       TMTrace (2, ("TM_Info::init_and_recover_rms : EXIT.\n"));
+       TMTrace (2, ("TM_Info::init_and_recover_rms - all RMs open: EXIT.\n"));
        return;
     }
 
@@ -1454,6 +1454,7 @@
     // once recovery is done, write 2 control points for a clean slate
     if (lead_tm())
     {
+        TMTrace(2, ("TM_Info::init_and_recover_rms : lead_dtm.\n"));
         // If this is the Lead TM, proceeds with system crash recovery.
         // Once recovery is done, the recover_system function will 
         // write 2 control points for a clean slate.  
@@ -1463,16 +1464,24 @@
            // send out recovery start sync.  The rest of system recover will be driven
            // from the completion
            ClusterRecov(new TM_Recov(gv_tm_info.rm_wait_time()));
+#if 0
            lv_error = ClusterRecov()->initiate_start_sync();
+#endif
+           gv_tm_info.set_sys_recov_status(TM_SYS_RECOV_STATE_END,
+                                           gv_tm_info.lead_tm_nid()); //my node sb the lead tm
+           gv_tm_info.schedule_recover_system();      
         }
+	tm_up();
     }
     else
     {
        // Mark the TM up now if it's pending.
+       TMTrace(2, ("TM_Info::init_and_recover_rms : not the lead_dtm.\n"));
+       gv_tm_info.set_sys_recov_status(TM_SYS_RECOV_STATE_END,lead_tm_nid());
        tm_up();
        if (restarting_tm() == nid())
        {
-          gv_tm_info.set_sys_recov_status(TM_SYS_RECOV_STATE_END,lead_tm_nid());
+          TMTrace(2, ("TM_Info::init_and_recover_rms : Calling msg_mon_tm_ready.\n"));
           msg_mon_tm_ready();
           restarting_tm(-1);
        }
@@ -1490,6 +1499,12 @@
 //           message is received by the monitor. It will restart the
 //           TM process immediately and send a sync to update the 
 //           restarted process.
+//
+// Update: This method will NOT be called by the lead DTM as
+//         the 'pstartd' (process startup daemon) will start up
+//         the TM on its node.
+//
+//         Keeping this method for the time being.
 // -------------------------------------------------------------------
 int32 TM_Info::restart_tm_process(int32 pv_nid) 
 {
@@ -1949,7 +1964,7 @@
 
     gv_tm_info.node_being_recovered (pv_nid, nid());
 
-    send_takeover_tm_sync(TM_RECOVERY_START, nid(), pv_nid);
+    //send_takeover_tm_sync(TM_RECOVERY_START, nid(), pv_nid);
                
     lv_error = msg_mon_get_node_info2(&lv_node_count, MAX_NODES, 
                                       NULL, &lv_lnode_count, NULL, NULL, NULL);
@@ -2006,7 +2021,7 @@
 // ---------------------------------------------------------------------------
 void TM_Info::set_recovery_end(int32 pv_nid)
 {
-     bool  lv_success = true;
+     bool  lv_success = false;
 
      TMTrace(2, ("TM_Info::set_recovery_end ENTRY, Node %d\n",pv_nid));
 
@@ -2019,6 +2034,7 @@
     if (mode() == TM_NONSYNC_MODE)
     {
         TMTrace(2, ("TM_Info::set_recovery_end (NONSYNC MODE)\n"));
+	lv_success = true;
         // Nothing to do!
     }
     else
@@ -2027,9 +2043,7 @@
          lv_success = do_take_over(lp_dataList);
     }
 
-    if (lv_success)
-        send_takeover_tm_sync(TM_RECOVERY_END, gv_tm_info.nid(), pv_nid);
-    else
+    if (! lv_success)
     {
         tm_log_event(DTM_TM_TAKEOVER_FAILED, SQ_LOG_CRIT, "DTM_TM_TAKEOVER_FAILED",
                      -1, -1, pv_nid);
@@ -2088,13 +2102,13 @@
            continue;
 
         // send sync here for start of sync
-        send_takeover_tm_sync(TM_RECOVERY_START, nid(), lv_info[lv_inx].nid);
+        //send_takeover_tm_sync(TM_RECOVERY_START, nid(), lv_info[lv_inx].nid);
     }
 
     delete [] lv_info;
 
     // process has been restarted, set to up and recover rms
-    send_tm_process_restart_sync(nid(), pv_nid);
+    //send_tm_process_restart_sync(nid(), pv_nid);
 
     send_tm_state_information();
 
@@ -2120,9 +2134,11 @@
             (iv_recovery[lv_index].iv_list_built == true))
 
         {
+#ifdef SUPPORT_TM_SYNC
             send_state_resync (iv_nid, iv_recovery[lv_index].iv_down_without_sync, 
                                iv_recovery[lv_index].iv_node_being_recovered,
                                iv_recovery[lv_index].iv_list_built, lv_index);
+#endif
             TMTrace(2, ("TM_Info::send_tm_state_information sent state for index %d.\n", lv_index));
         }
     }
@@ -3546,7 +3562,7 @@
    if (!iv_lead_tm)
       return (FEOK);
    
-   TMTrace (2, ("TM_Info::open_restarted_tm : ENTRY.\n"));
+   TMTrace (2, ("TM_Info::open_restarted_tm, nid:%d : ENTRY.\n", pv_nid));
   
    sprintf(la_buffer, "$tm%d", pv_nid);
    lock();
@@ -3577,7 +3593,7 @@
 //    SB_Thread::Sthr::sleep(100); // in msec
     dummy_link_to_refresh_phandle(pv_nid); // The second one actually updates the phandle
     
-   TMTrace (2, ("TM_Info::open_restarted_tm : EXIT"));
+   TMTrace (2, ("TM_Info::open_restarted_tm, nid:%d : EXIT.\n", pv_nid));
    return lv_error;
 }
 
@@ -3708,10 +3724,14 @@
             iv_state, iv_sys_recov_state, all_rms_closed()));
 
    // We need to block new transactions while the RMs are still being opened.
-   if (all_rms_closed())
+   if (all_rms_closed()) {
+      TMTrace(2, ("TM_Info::tm_up. Still waiting for RMS\n"));
       state(TM_STATE_WAITING_RM_OPEN);
-   else
+   }
+   else {
+      TMTrace(2, ("TM_Info::tm_up. Setting state to UP\n"));
       state(TM_STATE_UP);
+   }
 
    if (iv_sys_recov_state == TM_SYS_RECOV_STATE_END)
       wake_TMUP_waiters(FEOK);
@@ -3843,12 +3863,20 @@
                             (char *) la_tm_name, (char *) DTM_NEXT_SEQNUM_BLOCK, 
                             la_seq_num);
 
+      TMTrace (2, ("TM_Info::setNextSeqNumBlock : proc:%s, seqnum block:%s.\n",
+                   la_tm_name, 
+                   la_seq_num));
       if (lv_error == 0)
          lv_startSeqNum = (unsigned int) strtoul((char *) &la_seq_num, &lp_stop, 10);
+
+      TMTrace (2, ("TM_Info::setNextSeqNumBlock : proc:%s, seqnum block:%s, startseqnum: %d.\n",
+                   la_tm_name, 
+                   la_seq_num, 
+                   lv_startSeqNum));
    }
    else
       lv_startSeqNum = iv_nextSeqNum;
-   
+  
    // Check for sequence number wraparound
    if (lv_startSeqNum >= MAX_SEQNUM)
    {
diff --git a/core/sqf/src/tm/tmlib.cpp b/core/sqf/src/tm/tmlib.cpp
index f7b9a7d..c849746 100644
--- a/core/sqf/src/tm/tmlib.cpp
+++ b/core/sqf/src/tm/tmlib.cpp
@@ -2968,11 +2968,15 @@
     short     la_results[6];
     short     lv_ret = FEOK;
     int32     lv_linkRetries = 0;
+    int32     lv_breakRetries = 0;
     const int32 lc_maxLinkRetries = 100;
     const int32 lc_linkPause = 3000; // 3 second
+    const int32 lc_maxBreakRetries = 60;
+    const int32 lc_breakPause = 3000; // 3 second
 
     TMlibTrace(("TMLIB_TRACE : send_tm (node %d) ENTRY\n", pv_node), 2);
 
+ retry_on_fepathdown:
     if (!gv_tmlib.open_tm(pv_node))
     {
          TMlibTrace(("TMLIB_TRACE : returning FETMFNOTRUNNING\n"), 1);
@@ -3021,6 +3025,12 @@
     switch (lv_ret)
     {
     case FEPATHDOWN:
+       lv_breakRetries++;
+       if (lv_breakRetries <= lc_maxBreakRetries) {
+         SB_Thread::Sthr::sleep(lc_breakPause); // in msec
+         TMlibTrace(("TMLIB_TRACE : send_tm , retry after BMSG_BREAK error: FEPATHDOWN\n"), 3);
+         goto retry_on_fepathdown;
+       }
        lv_ret = FETMFNOTRUNNING;
        break;
     case FESERVICEDISABLED:
diff --git a/core/sqf/src/tm/tmrecov.cpp b/core/sqf/src/tm/tmrecov.cpp
index b3eb222..db05cd0 100644
--- a/core/sqf/src/tm/tmrecov.cpp
+++ b/core/sqf/src/tm/tmrecov.cpp
@@ -80,7 +80,9 @@
    ip_tm_info->set_sys_recov_status(TM_SYS_RECOV_STATE_START, ip_tm_info->nid());
    
    // Tell all the other TMs that system recovery has started.
+#ifdef SUPPORT_TM_SYNC
    send_sys_recov_start_sync(ip_tm_info->nid());
+#endif
    TMTrace (2, ("TM_Recov::initiate_start_sync EXIT.\n"));
    return lv_error;
 }
@@ -107,7 +109,7 @@
       ip_tm_info->write_control_point(true);
       ip_tm_info->write_control_point(true, true);
       ip_tm_info->set_sys_recov_status(TM_SYS_RECOV_STATE_END, lv_my_nid);
-      send_sys_recov_end_sync(lv_my_nid);
+      //send_sys_recov_end_sync(lv_my_nid);
       lv_error = FEOK;
       return (lv_error);
    }
@@ -189,7 +191,7 @@
    //lv_error = send_xa_recover_toall_TSEs(pv_dtm);
 
    // Sending ID for node down.
-   TMTrace (2, ("TM_Recov::recover_dtm_death: sending node down message"));
+   TMTrace (2, ("TM_Recov::recover_dtm_death: sending node down message.\n"));
    gv_HbaseTM.nodeDown(pv_dtm);
 
    if (lv_error != FEOK)
@@ -208,7 +210,7 @@
    TMTrace(3, ("TM_Recov::recover_dtm_death setting recovery_list_built to TRUE for Node %d\n",pv_dtm));
    ip_tm_info->recovery_list_built (pv_dtm, true);
    listBuilt (true);
-   send_recov_listbuilt_sync (ip_tm_info->nid(), pv_dtm);
+   //send_recov_listbuilt_sync (ip_tm_info->nid(), pv_dtm);
 
    iv_max_txs_to_recover = iv_total_txs_to_recover;
    tm_log_event(DTM_RECOVERY_TXNS_TO_RECOVER, SQ_LOG_NOTICE, "DTM_RECOVERY_TXNS_TO_RECOVER",
@@ -1344,7 +1346,7 @@
    ip_tm_info->write_control_point(true);
    ip_tm_info->write_control_point(true, true);
    ip_tm_info->set_sys_recov_status(TM_SYS_RECOV_STATE_END, ip_tm_info->nid());
-   send_sys_recov_end_sync(ip_tm_info->nid());
+   //send_sys_recov_end_sync(ip_tm_info->nid());
 
    TMTrace (2, ("TM_Recov::completeRecovery EXIT\n"));
 } //TM_Recov::completeRecovery
diff --git a/core/sqf/src/tm/tmtimermain.cpp b/core/sqf/src/tm/tmtimermain.cpp
index 7e82197..c2ef14f 100644
--- a/core/sqf/src/tm/tmtimermain.cpp
+++ b/core/sqf/src/tm/tmtimermain.cpp
@@ -49,6 +49,14 @@
     }
  
    TMTrace(2, ("tmTimer_initiate_cp : ENTRY.\n"));
+   
+   // This will send control point requests to the other TMs
+   if (gv_tm_info.lead_tm()) {
+     TMTrace(2, ("tmTimer_initiate_cp : Calling write_control_point \n"));
+     gv_tm_info.write_control_point(true);
+     gv_tm_info.write_control_point(true, true);
+   }
+
    if (gv_tm_info.use_tlog()) {
 
        // Initiate HBase TM Control Point
diff --git a/core/sqf/src/tm/tmtxbase.cpp b/core/sqf/src/tm/tmtxbase.cpp
index e67ba6e..baa7347 100644
--- a/core/sqf/src/tm/tmtxbase.cpp
+++ b/core/sqf/src/tm/tmtxbase.cpp
@@ -327,9 +327,11 @@
     }
     }; //switch
 
+#ifdef SUPPORT_TM_SYNC
     init_and_send_tx_sync_data( lv_type, pv_state, 
                             (TM_Transid_Type*)&iv_transid, 
                              pv_nid, pv_pid);
+#endif
 
     TMTrace (2, ("CTmTxBase::sync_write : EXIT. TxnId %d, type %d.\n",
                    iv_tag, lv_type));
diff --git a/core/sqf/src/trafconf/Makefile b/core/sqf/src/trafconf/Makefile
index f48a281..59f3f82 100644
--- a/core/sqf/src/trafconf/Makefile
+++ b/core/sqf/src/trafconf/Makefile
@@ -44,7 +44,7 @@
 LIBSX	+= -lrt
 
 # need -lsqlite3 for SQLite
-LIBSX	+=  -lsqlite3 -lpthread
+LIBSX	+=  -lsqlite3
 
 # need -lpthread starting on CentOS7
 LIBSX	+=  -lpthread  
@@ -86,15 +86,15 @@
 	@echo 
 	@echo Building Trafodion Configuration Library
 	@echo 
-	@echo $(CXX) $(LIBSX) $(LNK_FLGS) -shared -o $@ $(OBJTRAFCONFIG)
-	$(CXX) $(LIBSX) $(LNK_FLGS) -shared -o $@ $(OBJTRAFCONFIG)
+	@echo $(CXX) $(LIBSX) -o $@ $(OBJTRAFCONFIG) -shared $(LNK_FLGS) 
+	$(CXX) $(LIBSX) -o $@ $(OBJTRAFCONFIG) -shared $(LNK_FLGS) 
 
 $(BINEXPDIR)/trafconf: $(OBJTRAFCONF)
 	@echo 
 	@echo Building Trafodion Configuration Utility
 	@echo 
-	@echo $(CXX) $(LIBSX) $(CDEPFLAGS) $(FLAGS) $(OPTIONS) $(INCLUDES) -o $@ $(OBJTRAFCONF)
-	@$(CXX) $(LIBSX) $(LIBTRAFCONFIGX) $(CDEPFLAGS) $(FLAGS) $(OPTIONS) $(INCLUDES) -o $@ $(OBJTRAFCONF)
+	@echo $(CXX) $(INCLUDES) -o $@ $(OBJTRAFCONF) $(LIBTRAFCONFIGX) $(CDEPFLAGS) $(FLAGS) $(OPTIONS) $(LIBSX)
+	@$(CXX) $(INCLUDES) -o $@ $(OBJTRAFCONF) $(LIBTRAFCONFIGX) $(CDEPFLAGS) $(FLAGS) $(OPTIONS) $(LIBSX)
 
 stop:
 	-killall shell monitor
diff --git a/core/sqf/src/trafconf/clusterconf.cpp b/core/sqf/src/trafconf/clusterconf.cpp
index d3938ce..4883207 100644
--- a/core/sqf/src/trafconf/clusterconf.cpp
+++ b/core/sqf/src/trafconf/clusterconf.cpp
@@ -50,6 +50,9 @@
               : CPNodeConfigContainer(TC_NODES_MAX)
               , CLNodeConfigContainer(TC_NODES_MAX)
               , configMaster_(-1)
+              , clusterId_(-1)
+              , instanceId_(-1)
+              , isRealCluster_(true)
               , nodeReady_(false)
               , persistReady_(false)
               , newPNodeConfig_(true)
@@ -62,6 +65,41 @@
     const char method_name[] = "CClusterConfig::CClusterConfig";
     TRACE_ENTRY;
 
+    if ( getenv( "SQ_VIRTUAL_NODES" ) )
+    {
+        isRealCluster_ = false;
+    }
+
+    char *env;
+    env = getenv("TRAF_CLUSTER_ID");
+    if ( env && isdigit(*env) )
+    {
+        clusterId_ = atoi(env);
+    }
+    else
+    {
+        char la_buf[TC_LOG_BUF_SIZE];
+        sprintf( la_buf
+               , "[%s], Environment variable TRAF_CLUSTER_ID is undefined, exiting!\n"
+               , method_name);
+        TcLogWrite( MON_CLUSTERCONF_CLUSTERCONFIG_1, TC_LOG_CRIT, la_buf );
+        exit(EXIT_FAILURE);
+    }
+    env = getenv("TRAF_INSTANCE_ID");
+    if ( env && isdigit(*env) )
+    {
+        instanceId_ = atoi(env);
+    }
+    else
+    {
+        char la_buf[TC_LOG_BUF_SIZE];
+        sprintf( la_buf
+               , "[%s], Environment variable TRAF_INSTANCE_ID is undefined, exiting!\n"
+               , method_name);
+        TcLogWrite( MON_CLUSTERCONF_CLUSTERCONFIG_2, TC_LOG_CRIT, la_buf );
+        exit(EXIT_FAILURE);
+    }
+
     memset( &configMasterName_, 0, TC_PROCESSOR_NAME_MAX );
 
     TRACE_EXIT;
@@ -116,11 +154,12 @@
 
     if ( TcTraceSettings & TC_TRACE_INIT )
     {
-        trace_printf( "%s@%d nid=%d, pnid=%d, nodename=%s\n"
+        trace_printf( "%s@%d nid=%d, pnid=%d, nodename=%s, domainname=%s\n"
                     , method_name, __LINE__
                     , lnodeConfigInfo.nid
                     , pnodeConfigInfo.pnid
-                    , pnodeConfigInfo.nodename );
+                    , pnodeConfigInfo.nodename
+                    , pnodeConfigInfo.domainname );
     }
 
     if ( newPNodeConfig_ )
@@ -141,10 +180,11 @@
 
     if ( TcTraceSettings & TC_TRACE_INIT )
     {
-        trace_printf( "%s@%d pnid=%d, nodename=%s\n"
+        trace_printf( "%s@%d pnid=%d, nodename=%s, domainname=%s\n"
                     , method_name, __LINE__
                     , pnodeConfigInfo.pnid
-                    , pnodeConfigInfo.nodename );
+                    , pnodeConfigInfo.nodename
+                    , pnodeConfigInfo.domainname );
     }
 
     if ( newPNodeConfig_ )
@@ -376,50 +416,15 @@
         return( false );
     }
 
-    bool lv_is_real_cluster = true;
-    if ( getenv( "SQ_VIRTUAL_NODES" ) )
-    {
-        lv_is_real_cluster = false;
-    }
-
     // Process logical nodes
     for (int i =0; i < nodeCount; i++ )
     {
-        char *tmpptr = nodeConfigData[i].node_name;
-        while ( *tmpptr )
-        {
-            *tmpptr = (char)tolower( *tmpptr );
-            tmpptr++;
-        }
-    
-        if (lv_is_real_cluster)
-        {
-            // Remove the domain portion of the name if any
-            char short_node_name[TC_PROCESSOR_NAME_MAX];
-            char str1[TC_PROCESSOR_NAME_MAX];
-            memset( str1, 0, TC_PROCESSOR_NAME_MAX );
-            memset( short_node_name, 0, TC_PROCESSOR_NAME_MAX );
-            strcpy (str1, nodeConfigData[i].node_name );
-
-            char *str1_dot = strchr( (char *) str1, '.' );
-            if ( str1_dot )
-            {
-                memcpy( short_node_name, str1, str1_dot - str1 );
-            }
-            else
-            {
-                strcpy (short_node_name, str1 );
-            }
-
-            strcpy(nodeConfigData[i].node_name, short_node_name);
-
-        }
-
         if ( TcTraceSettings & TC_TRACE_INIT )
         {
-            trace_printf( "%s@%d nodename=%s\n"
+            trace_printf( "%s@%d node_name=%s, domain_name=%s\n"
                           , method_name, __LINE__
-                          , nodeConfigData[i].node_name);
+                          , nodeConfigData[i].node_name
+                          , nodeConfigData[i].domain_name );
         }
 
         ProcessLNode( nodeConfigData[i], pnodeConfigInfo, lnodeConfigInfo );
@@ -453,8 +458,10 @@
         AddSNodeConfiguration( pnodeConfigInfo );
     }
 
+    prevPNodeConfig_ = NULL;
+    prevLNodeConfig_ = NULL;
     nodeReady_ = true;
-
+    
     if ( TcTraceSettings & TC_TRACE_INIT )
     {
         if ( nodeReady_ )
@@ -554,12 +561,14 @@
 
     if ( TcTraceSettings & TC_TRACE_INIT )
     {
-        trace_printf( "%s@%d nid=%d, pnid=%d, name=%s, excluded cores=(%d:%d),"
-                      " cores=(%d:%d), processors=%d, roles=%d\n"
+        trace_printf( "%s@%d nid=%d, pnid=%d, name=%s, domain=%s, "
+                      "excluded cores=(%d:%d), cores=(%d:%d), "
+                      "processors=%d, roles=%d\n"
                     , method_name, __LINE__
                     , nodeConfigData.nid
                     , nodeConfigData.pnid
                     , nodeConfigData.node_name
+                    , nodeConfigData.domain_name
                     , nodeConfigData.excluded_first_core
                     , nodeConfigData.excluded_last_core
                     , nodeConfigData.first_core
@@ -578,6 +587,9 @@
         strncpy( pnodeConfigInfo.nodename
                , nodeConfigData.node_name
                , sizeof(pnodeConfigInfo.nodename) );
+        strncpy( pnodeConfigInfo.domainname
+               , nodeConfigData.domain_name
+               , sizeof(pnodeConfigInfo.domainname) );
         pnodeConfigInfo.excludedFirstCore = nodeConfigData.excluded_first_core;
         pnodeConfigInfo.excludedLastCore  = nodeConfigData.excluded_last_core;
         excludedCores = (nodeConfigData.excluded_first_core != -1 || 
@@ -600,6 +612,9 @@
     strncpy( lnodeConfigInfo.nodename
            , nodeConfigData.node_name
            , sizeof(lnodeConfigInfo.nodename) );
+    strncpy( lnodeConfigInfo.domainname
+           , nodeConfigData.domain_name
+           , sizeof(lnodeConfigInfo.domainname) );
     lnodeConfigInfo.firstCore = nodeConfigData.first_core;
     lnodeConfigInfo.lastCore  = nodeConfigData.last_core;
     SetCoreMask( nodeConfigData.first_core
@@ -619,8 +634,8 @@
 
     if ( TcTraceSettings & TC_TRACE_INIT )
     {
-        trace_printf( "%s@%d pnid=%d, name=%s, excluded cores=(%d:%d), "
-                      "spareCount=%d\n"
+        trace_printf( "%s@%d pnid=%d, node_name=%s, "
+                      "excluded cores=(%d:%d), spareCount=%d\n"
                     , method_name, __LINE__
                     , pnodeConfig.pnid
                     , pnodeConfig.node_name
@@ -630,13 +645,18 @@
                     );
     }
 
-    newPNodeConfig_ = (pnodeConfig.pnid != prevPNodeConfig_->GetPNid()) 
+    newPNodeConfig_ = ((prevPNodeConfig_ == NULL) ||
+                       (pnodeConfig.pnid != prevPNodeConfig_->GetPNid()))
                         ? true : false;
     if ( newPNodeConfig_ )
     {
-        strncpy( pnodeConfigInfo.nodename
-               , pnodeConfig.node_name
-               , sizeof(pnodeConfigInfo.nodename) );
+        if ( TcTraceSettings & TC_TRACE_INIT )
+        {
+            trace_printf( "%s@%d node_name=%s, domain_name=%s\n"
+                          , method_name, __LINE__
+                          , pnodeConfigInfo.nodename
+                          , pnodeConfigInfo.domainname );
+        }
 
         bool excludedCores = (pnodeConfig.excluded_first_core != -1 || 
                               pnodeConfig.excluded_last_core != -1)
@@ -661,7 +681,7 @@
 }
 
 void CClusterConfig::ProcessPersistInfo( TcPersistConfiguration_t &persistConfig
-                                       , persistConfigInfo_t     &persistConfigInfo )
+                                       , persistConfigInfo_t      &persistConfigInfo )
 {
     const char method_name[] = "CClusterConfig::ProcessPersistInfo";
     TRACE_ENTRY;
@@ -751,6 +771,7 @@
 }
 
 bool CClusterConfig::SaveNodeConfig( const char *name
+                                   , const char *domain
                                    , int         nid
                                    , int         pnid
                                    , int         firstCore
@@ -771,11 +792,13 @@
 
     if (TcTraceSettings & (TC_TRACE_INIT | TC_TRACE_REQUEST))
     {
-        trace_printf( "%s@%d Saving node config (node_name=%s, processors=%d, "
+        trace_printf( "%s@%d Saving node config "
+                      "(name=%s, domain=%s, processors=%d, "
                       "roles=%d, firstCore=%d, lastCore=%d "
                       "excludedFirstCore=%d, excludedLastCore=%d)\n"
                      , method_name, __LINE__
                      , name
+                     , domain
                      , processors
                      , roles
                      , firstCore
@@ -787,6 +810,7 @@
     nodeConfig.nid  = nid;
     nodeConfig.pnid = pnid;
     strncpy( nodeConfig.node_name, name, sizeof(nodeConfig.node_name) );
+    strncpy( nodeConfig.domain_name, domain, sizeof(nodeConfig.domain_name) );
     nodeConfig.excluded_first_core = excludedFirstCore;
     nodeConfig.excluded_last_core  = excludedLastCore;
     nodeConfig.first_core = firstCore;
@@ -828,6 +852,7 @@
 
 bool CClusterConfig::UpdatePNodeConfig( int         pnid
                                       , const char *name
+                                      , const char *domain
                                       , int         excludedFirstCore
                                       , int         excludedLastCore )
 {
@@ -841,18 +866,27 @@
     if (TcTraceSettings & (TC_TRACE_INIT | TC_TRACE_REQUEST))
     {
         trace_printf( "%s@%d Updating pnode config "
-                      "(pnid=%d, node_name=%s, "
+                      "(pnid=%d, name=%s, domain=%s, "
                       "excludedFirstCore=%d, excludedLastCore=%d)\n"
                      , method_name, __LINE__
                      , pnid
                      , name
+                     , domain
                      , excludedFirstCore
                      , excludedLastCore );
     }
 
     memset( &pnodeConfig, 0, sizeof(TcPhysicalNodeConfiguration_t) );
     pnodeConfig.pnid = pnid;
-    strncpy( pnodeConfig.node_name, name, sizeof(pnodeConfig.node_name) );
+    if (strlen(domain))
+    {
+        snprintf( pnodeConfig.node_name, sizeof(pnodeConfig.node_name)
+                , "%s.%s", name, domain );
+    }
+    else
+    {
+        strncpy( pnodeConfig.node_name, name, sizeof(pnodeConfig.node_name) );
+    }
     pnodeConfig.excluded_first_core = excludedFirstCore;
     pnodeConfig.excluded_last_core  = excludedLastCore;
     
@@ -863,6 +897,7 @@
         // Update physical node to configuration object
         UpdatePNodeConfiguration( pnid
                                 , name
+                                , domain
                                 , excludedFirstCore
                                 , excludedLastCore );
     }
@@ -871,8 +906,8 @@
         rs = false;
         char buf[TC_LOG_BUF_SIZE];
         snprintf( buf, sizeof(buf)
-                , "[%s] PNode update failed, pnid=%d, node_name=%s\n"
-                , method_name,  pnid, name );
+                , "[%s] PNode update failed, pnid=%d, name=%s, domain=%s\n"
+                , method_name,  pnid, name, domain );
         TcLogWrite( MON_CLUSTERCONF_UPDATEPNODECFG_1, TC_LOG_ERR, buf );
     }
 
@@ -882,6 +917,7 @@
 
 void CClusterConfig::UpdatePNodeConfiguration( int         pnid
                                              , const char *name
+                                             , const char *domain
                                              , int         excludedFirstCore
                                              , int         excludedLastCore )
 {
@@ -903,6 +939,7 @@
     if ( pnodeConfig )
     {
         pnodeConfig->SetName( name );
+        pnodeConfig->SetDomain( domain );
         pnodeConfig->SetExcludedFirstCore( excludedFirstCore );
         pnodeConfig->SetExcludedLastCore( excludedLastCore );
     }
diff --git a/core/sqf/src/trafconf/clusterconf.h b/core/sqf/src/trafconf/clusterconf.h
index ff4b17e..1a7196d 100644
--- a/core/sqf/src/trafconf/clusterconf.h
+++ b/core/sqf/src/trafconf/clusterconf.h
@@ -43,8 +43,10 @@
 
     void            Clear( void );
     bool            DeleteNodeConfig( int  pnid );
-    int             GetConfigMaster ( ) { return configMaster_;} 
-    char *          GetConfigMasterByName() {return configMasterName_;} 
+    inline int      GetClusterId( void ) { return clusterId_;} 
+    inline int      GetConfigMaster( void ) { return configMaster_;} 
+    inline char *   GetConfigMasterByName( void ) {return configMasterName_;} 
+    inline int      GetInstanceId( void ) { return instanceId_;} 
     bool            Initialize( void );
     bool            Initialize( bool traceEnabled, const char *traceFile );
     void            InitCoreMask( cpu_set_t &coreMask );
@@ -56,6 +58,7 @@
     bool            LoadNodeConfig( void );
     bool            LoadPersistConfig( void );
     bool            SaveNodeConfig( const char *name
+                                  , const char *domain
                                   , int         nid
                                   , int         pnid
                                   , int         firstCore
@@ -69,6 +72,7 @@
                                , cpu_set_t &coreMask );
     bool            UpdatePNodeConfig( int         pnid
                                      , const char *name
+                                     , const char *domain
                                      , int         excludedFirstCore
                                      , int         excludedLastCore );
 
@@ -76,7 +80,10 @@
 private:
 
     int             configMaster_;
+    int             clusterId_;
+    int             instanceId_;
     char            configMasterName_[TC_PROCESSOR_NAME_MAX];
+    bool            isRealCluster_;
     bool            nodeReady_;    // true when node configuration loaded
     bool            persistReady_; // true when persist configuration loaded
     bool            newPNodeConfig_;
@@ -115,6 +122,7 @@
                            , int         excludedLastCore );
     void  UpdatePNodeConfiguration( int         pnid
                                   , const char *name
+                                  , const char *domain
                                   , int         excludedFirstCore
                                   , int         excludedLastCore );
 };
diff --git a/core/sqf/src/trafconf/lnodeconfig.cpp b/core/sqf/src/trafconf/lnodeconfig.cpp
index 70af102..e6fc8bc 100644
--- a/core/sqf/src/trafconf/lnodeconfig.cpp
+++ b/core/sqf/src/trafconf/lnodeconfig.cpp
@@ -78,6 +78,16 @@
     TRACE_EXIT;
 }
 
+const char *CLNodeConfig::GetDomain( void )
+{ // Forwards to pnodeConfig_->GetDomain(); pnodeConfig_ is not NULL-checked here
+    return( pnodeConfig_->GetDomain() );
+}
+
+const char *CLNodeConfig::GetFqdn( void )
+{ // Forwards to pnodeConfig_->GetFqdn(); pnodeConfig_ is not NULL-checked here
+    return( pnodeConfig_->GetFqdn() );
+}
+
 const char *CLNodeConfig::GetName( void )
 {
     return( pnodeConfig_->GetName() );
@@ -240,7 +250,20 @@
         return( NULL );
     }
 
-    assert( lnodesConfig_[lnodeConfigInfo.nid] == NULL );
+    if( lnodesConfig_[lnodeConfigInfo.nid] != NULL )
+    {
+        if (TcTraceSettings & (TC_TRACE_INIT | TC_TRACE_REQUEST))
+        {
+            trace_printf( "%s@%d - Existing logical node configuration object\n"
+                          "        (nid=%d, pnid=%d, nextNid_=%d)\n"
+                          "        (lnodesCount_=%d,lnodesConfigMax=%d)\n"
+                        , method_name, __LINE__
+                        , lnodeConfigInfo.nid, pnodeConfig->GetPNid(), nextNid_
+                        , lnodesCount_, lnodesConfigMax_);
+        }
+        TRACE_EXIT;
+        return( lnodesConfig_[lnodeConfigInfo.nid] );
+    }
 
     CLNodeConfig *lnodeConfig = new CLNodeConfig( pnodeConfig
                                                 , lnodeConfigInfo );
diff --git a/core/sqf/src/trafconf/lnodeconfig.h b/core/sqf/src/trafconf/lnodeconfig.h
index e14e5e0..3292ded 100644
--- a/core/sqf/src/trafconf/lnodeconfig.h
+++ b/core/sqf/src/trafconf/lnodeconfig.h
@@ -33,6 +33,7 @@
     int        nid;
     int        pnid;
     char       nodename[TC_PROCESSOR_NAME_MAX];
+    char       domainname[TC_PROCESSOR_NAME_MAX];
     int        firstCore;
     int        lastCore;
     cpu_set_t  coreMask;
@@ -90,6 +91,8 @@
     inline cpu_set_t    &GetCoreMask( void ) { return( coreMask_ ); }
     inline int           GetFirstCore( void ) { return( firstCore_ ); }
     inline int           GetLastCore( void ) { return( lastCore_ ); }
+    const char          *GetDomain( void );
+    const char          *GetFqdn( void );
     const char          *GetName( void );
     inline CLNodeConfig *GetNext( void ) { return( next_); }
     inline CLNodeConfig *GetNextP( void ) { return( nextP_); }
diff --git a/core/sqf/src/trafconf/pnodeconfig.cpp b/core/sqf/src/trafconf/pnodeconfig.cpp
index a47ce56..6f3ee16 100644
--- a/core/sqf/src/trafconf/pnodeconfig.cpp
+++ b/core/sqf/src/trafconf/pnodeconfig.cpp
@@ -60,6 +60,15 @@
     TRACE_ENTRY;
 
     strcpy( name_, pnodeConfigInfo.nodename );
+    strcpy( domain_, pnodeConfigInfo.domainname );
+    if (strlen( domain_ ))
+    {
+        snprintf( fqdn_, sizeof(fqdn_), "%s.%s", name_, domain_ );
+    }
+    else
+    {
+        strncpy( fqdn_, name_, sizeof(fqdn_) );
+    }
     CPU_ZERO( &excludedCoreMask_ );
 
     TRACE_EXIT;
@@ -148,6 +157,14 @@
     TRACE_EXIT;
 }
 
+void CPNodeConfig::SetDomain( const char *newDomain )
+{ // Bounded, NUL-terminated copy; a NULL newDomain leaves the object untouched.
+    if (!newDomain) return;
+    snprintf( domain_, sizeof(domain_), "%s", newDomain );
+    // Rebuild fqdn_ ("name" or "name.domain") so GetFqdn() tracks the new domain.
+    snprintf( fqdn_, sizeof(fqdn_), "%s%s%s", name_, domain_[0] ? "." : "", domain_ );
+}
+
 void CPNodeConfig::SetName( const char *newName ) 
 { 
     if (newName) 
@@ -269,7 +286,20 @@
         return( NULL );
     }
 
-    assert( pnodesConfig_[pnodeConfigInfo.pnid] == NULL );
+    if( pnodesConfig_[pnodeConfigInfo.pnid] != NULL )
+    {
+        if (TcTraceSettings & (TC_TRACE_INIT | TC_TRACE_REQUEST))
+        {
+            trace_printf( "%s@%d - Existing physical node configuration object\n"
+                          "        (pnid=%d, nextPNid_=%d)\n"
+                          "        (pnodesCount_=%d,pnodesConfigMax=%d)\n"
+                        , method_name, __LINE__
+                        , pnodeConfigInfo.pnid, nextPNid_
+                        , pnodesCount_, pnodesConfigMax_);
+        }
+        TRACE_EXIT;
+        return( pnodesConfig_[pnodeConfigInfo.pnid] );
+    }
 
     CPNodeConfig *pnodeConfig = new CPNodeConfig( this, pnodeConfigInfo );
     if (pnodeConfig)
@@ -457,6 +487,38 @@
     return config;
 }
 
+CPNodeConfig *CPNodeConfigContainer::GetNextPNodeConfigByName( char * nodename )
+{
+    const char method_name[] = "CPNodeConfigContainer::GetNextPNodeConfigByName";
+    TRACE_ENTRY;
+
+    // Looks up 'nodename', then walks the pnid slots circularly (wrapping
+    // from pnodesConfigMax_ - 1 back to 0) and returns the first populated
+    // entry after it. Returns NULL when 'nodename' is not configured.
+    CPNodeConfig *found = GetPNodeConfig( nodename );
+
+    if (found)
+    {
+        CPNodeConfig *candidate = NULL;
+        int slot = found->GetPNid();
+
+        // NOTE(review): termination relies on GetPNodeConfig( int ) finding
+        // the starting node again once the walk has wrapped all the way
+        // around -- presumably true since it was just found by name; confirm.
+        do
+        { // Step to the next slot, skipping the empty entries
+            slot = ((slot + 1) < pnodesConfigMax_) ? (slot + 1) : 0;
+            candidate = GetPNodeConfig( slot );
+        }
+        while ( !candidate );
+
+        found = candidate;
+    }
+
+    TRACE_EXIT;
+    return found;
+}
+
 void CPNodeConfigContainer::GetSpareNodesConfigSet( const char *name
                                                   , PNodesConfigList_t &spareSet )
 {
diff --git a/core/sqf/src/trafconf/pnodeconfig.h b/core/sqf/src/trafconf/pnodeconfig.h
index f3b05f7..2381cdb 100644
--- a/core/sqf/src/trafconf/pnodeconfig.h
+++ b/core/sqf/src/trafconf/pnodeconfig.h
@@ -40,6 +40,7 @@
 {
     int        pnid;
     char       nodename[TC_PROCESSOR_NAME_MAX];
+    char       domainname[TC_PROCESSOR_NAME_MAX];
     int        excludedFirstCore;
     int        excludedLastCore;
     cpu_set_t  excludedCoreMask;
@@ -59,6 +60,7 @@
     void          DeletePNodeConfig( CPNodeConfig *pnodeConfig );
     inline CPNodeConfig *GetFirstPNodeConfig( void ) { return ( head_ ); }
     inline int    GetNextPNid( void ) { return ( nextPNid_ ); }
+    CPNodeConfig *GetNextPNodeConfigByName( char * name );
     int           GetPNid( char  *nodename );
     CPNodeConfig *GetPNodeConfig( char *nodename );
     CPNodeConfig *GetPNodeConfig( int pnid );
@@ -98,9 +100,12 @@
     inline cpu_set_t    &GetExcludedCoreMask( void ) { return (excludedCoreMask_); }
     inline int           GetExcludedFirstCore( void ) { return ( excludedFirstCore_ ); }
     inline int           GetExcludedLastCore( void ) { return ( excludedLastCore_ ); }
+    inline const char   *GetDomain( void ) { return ( domain_ ); }
+    inline const char   *GetFqdn( void ) { return ( fqdn_ ); }
     inline const char   *GetName( void ) { return ( name_ ); }
     inline CPNodeConfig *GetNext( void ) { return ( next_ ); }
     inline int           GetPNid( void ) { return ( pnid_ ); }
+    void                 SetDomain( const char *newDomain ); 
     void                 SetName( const char *newName ); 
     void                 SetExcludedFirstCore( int excludedFirstCore ); 
     void                 SetExcludedLastCore( int excludedLastCore ); 
@@ -115,7 +120,9 @@
 protected:
 private:
     CPNodeConfigContainer *pnodesConfig_; // physical nodes container
-    char                   name_[TC_PROCESSOR_NAME_MAX]; // hostname
+    char                   name_[TC_PROCESSOR_NAME_MAX]; // short hostname
+    char                   domain_[TC_PROCESSOR_NAME_MAX]; // domain
+    char                   fqdn_[TC_PROCESSOR_NAME_MAX]; // FQDN hostname
     int                    pnid_;         // physical node identifier
     cpu_set_t              excludedCoreMask_; // mask of excluded SMP processor cores
     int                    excludedFirstCore_;// First excluded SMP processor core used by logical node
diff --git a/core/sqf/src/trafconf/tcdbsqlite.cpp b/core/sqf/src/trafconf/tcdbsqlite.cpp
index 43cdc42..0e7dab5 100644
--- a/core/sqf/src/trafconf/tcdbsqlite.cpp
+++ b/core/sqf/src/trafconf/tcdbsqlite.cpp
@@ -2436,10 +2436,48 @@
                     , exclastcore );
     }
 
+    if (TcIsRealCluster)
+    {
+        char short_node_name[TC_PROCESSOR_NAME_MAX];
+        char str1[TC_PROCESSOR_NAME_MAX];
+        char *tmpptr = NULL;
+        tmpptr = (char*)nodename;
+
+        while ( *tmpptr )
+        {
+            *tmpptr = (char)tolower( *tmpptr );
+            tmpptr++;
+        }
+
+        // Extract the domain portion of the name if any
+        memset( str1, 0, TC_PROCESSOR_NAME_MAX );
+        memset( short_node_name, 0, TC_PROCESSOR_NAME_MAX );
+        strcpy (str1, nodename );
+
+        char *str1_dot = strchr( (char *) str1, '.' );
+        if ( str1_dot )
+        {
+            memcpy( short_node_name, str1, str1_dot - str1 );
+            // copy the domain portion and skip the '.'
+            strcpy( spareNodeConfig.node_name, short_node_name );
+            strcpy( spareNodeConfig.domain_name, str1_dot+1 );
+        }
+        else
+        {
+            strncpy( spareNodeConfig.node_name
+                   , nodename
+                   , sizeof(spareNodeConfig.node_name) );
+            spareNodeConfig.domain_name[0] = 0;
+        }
+    }
+    else
+    {
+        strncpy( spareNodeConfig.node_name
+               , nodename
+               , sizeof(spareNodeConfig.node_name) );
+    }
+
     spareNodeConfig.pnid = pnid;
-    strncpy( spareNodeConfig.node_name
-           , nodename
-           , sizeof(spareNodeConfig.node_name) );
     spareNodeConfig.excluded_first_core = excfirstcore;
     spareNodeConfig.excluded_last_core = exclastcore;
 
@@ -3425,11 +3463,49 @@
                     , roles );
     }
 
+    if (TcIsRealCluster)
+    {
+        char short_node_name[TC_PROCESSOR_NAME_MAX];
+        char str1[TC_PROCESSOR_NAME_MAX];
+        char *tmpptr = NULL;
+        tmpptr = (char *)nodename;
+
+        while ( *tmpptr )
+        { // Set to lowercase characters
+            *tmpptr = (char)tolower( *tmpptr );
+            tmpptr++;
+        }
+
+        // Extract the domain portion of the name if any
+        memset( str1, 0, TC_PROCESSOR_NAME_MAX );
+        memset( short_node_name, 0, TC_PROCESSOR_NAME_MAX );
+        strcpy (str1, nodename );
+
+        char *str1_dot = strchr( (char *) str1, '.' );
+        if ( str1_dot )
+        {
+            memcpy( short_node_name, str1, str1_dot - str1 );
+            // copy the domain portion and skip the '.'
+            strcpy( nodeConfig.node_name, short_node_name );
+            strcpy( nodeConfig.domain_name, str1_dot+1 );
+        }
+        else
+        {
+            strncpy( nodeConfig.node_name
+                   , nodename
+                   , sizeof(nodeConfig.node_name) );
+            nodeConfig.domain_name[0] = 0;
+        }
+    }
+    else
+    {
+        strncpy( nodeConfig.node_name
+               , nodename
+               , sizeof(nodeConfig.node_name) );
+    }
+
     nodeConfig.nid  = nid;
     nodeConfig.pnid = pnid;
-    strncpy( nodeConfig.node_name
-           , nodename
-           , sizeof(nodeConfig.node_name) );
     nodeConfig.excluded_first_core = excfirstcore;
     nodeConfig.excluded_last_core = exclastcore;
     nodeConfig.first_core = firstcore;
@@ -3437,6 +3513,24 @@
     nodeConfig.processors = processors;
     nodeConfig.roles  = roles;
 
+    if ( TcTraceSettings & (TC_TRACE_NODE | TC_TRACE_REQUEST) )
+    {
+        trace_printf( "%s@%d nid=%d, pnid=%d, node_name=%s, domain_name=%s, "
+                      "excluded cores=(%d:%d),  cores=(%d:%d), "
+                      "processors=%d, roles=%d\n"
+                    , method_name, __LINE__
+                    , nodeConfig.nid
+                    , nodeConfig.pnid
+                    , nodeConfig.node_name
+                    , nodeConfig.domain_name
+                    , nodeConfig.excluded_first_core
+                    , nodeConfig.excluded_last_core
+                    , nodeConfig.first_core
+                    , nodeConfig.last_core
+                    , nodeConfig.processors
+                    , nodeConfig.roles );
+    }
+
     TRACE_EXIT;
 }
 
@@ -3460,13 +3554,62 @@
                     , exclastcore );
     }
 
+    if (TcIsRealCluster)
+    {
+        char short_node_name[TC_PROCESSOR_NAME_MAX];
+        char str1[TC_PROCESSOR_NAME_MAX];
+        char *tmpptr = NULL;
+        tmpptr = (char *)nodename;
+
+        while ( *tmpptr )
+        {
+            *tmpptr = (char)tolower( *tmpptr );
+            tmpptr++;
+        }
+
+        // Extract the domain portion of the name if any
+        memset( str1, 0, TC_PROCESSOR_NAME_MAX );
+        memset( short_node_name, 0, TC_PROCESSOR_NAME_MAX );
+        strcpy (str1, nodename );
+
+        char *str1_dot = strchr( (char *) str1, '.' );
+        if ( str1_dot )
+        { // Set to lowercase characters
+            memcpy( short_node_name, str1, str1_dot - str1 );
+            // copy the domain portion and skip the '.'
+            strcpy( pnodeConfig.node_name, short_node_name );
+            strcpy( pnodeConfig.domain_name, str1_dot+1 );
+        }
+        else
+        {
+            strncpy( pnodeConfig.node_name
+                   , nodename
+                   , sizeof(pnodeConfig.node_name) );
+            pnodeConfig.domain_name[0] = 0;
+        }
+    }
+    else
+    {
+        strncpy( pnodeConfig.node_name
+               , nodename
+               , sizeof(pnodeConfig.node_name) );
+    }
+
     pnodeConfig.pnid = pnid;
-    strncpy( pnodeConfig.node_name
-           , nodename
-           , sizeof(pnodeConfig.node_name) );
     pnodeConfig.excluded_first_core = excfirstcore;
     pnodeConfig.excluded_last_core = exclastcore;
 
+    if ( TcTraceSettings & (TC_TRACE_NODE | TC_TRACE_REQUEST) )
+    {
+        trace_printf( "%s@%d pnid=%d, node_name=%s, domain_name=%s, excluded cores=(%d:%d)\n"
+                    , method_name, __LINE__
+                    , pnodeConfig.pnid
+                    , pnodeConfig.node_name
+                    , pnodeConfig.domain_name
+                    , pnodeConfig.excluded_first_core
+                    , pnodeConfig.excluded_last_core );
+    }
+
     TRACE_EXIT;
 }
 
diff --git a/core/sqf/src/trafconf/tcdbstore.h b/core/sqf/src/trafconf/tcdbstore.h
index bfe53a1..98b2539 100644
--- a/core/sqf/src/trafconf/tcdbstore.h
+++ b/core/sqf/src/trafconf/tcdbstore.h
@@ -30,6 +30,7 @@
 
 using namespace std;
 
+extern bool TcIsRealCluster;
 
 //
 // Trafodion Configuration Database Adaptor (CTcdbStore class)
diff --git a/core/sqf/src/trafconf/trafconf.cpp b/core/sqf/src/trafconf/trafconf.cpp
index 35994cd..8277727 100644
--- a/core/sqf/src/trafconf/trafconf.cpp
+++ b/core/sqf/src/trafconf/trafconf.cpp
@@ -47,9 +47,15 @@
 
 typedef enum {
     TrafConfType_Undefined=0,         // Invalid
+
+    TrafConfType_ClusterId,           // Display Cluster Id: -clusterid
+    TrafConfType_InstanceId,          // Display Instance Id: -instanceid
+
     TrafConfType_NodeName,            // Display node names: -name -short
     TrafConfType_NodeName_w,          // Display node names: -wname -wshort
     TrafConfType_NodeId,              // Display node ids
+    TrafConfType_MyNodeName,          // Display local node name
+    TrafConfType_MyNodeId,            // Display local node id
     TrafConfType_PhysicalNodeId,      // Display physical node ids
     TrafConfType_ZoneId,              // Display zone ids
     // the above displays values as: "<value-1>  <value-2> ..."
@@ -140,15 +146,19 @@
 void DisplayUsage( void )
 {
     fprintf( stderr, 
-"\nUsage: trafconf { -? | -h | -name | -short | -wname | -wshort | \\\n"
-"                  -nameserver | -ns | -node | -persist | \\\n"
+"\nUsage: trafconf { -? | -h | -cid | -iid | -name | -short | -wname | -wshort | \\\n"
+"                  -myname | -mynid | -nameserver | -ns | -node | -persist | \\\n"
 "                  -node-max | -nid-count | -pnid-count | -spares-count | \\\n"
-"                  --nameserver | --ns | --node | --persist | \\\n"
+"                  --cid | --iid | --name| --short | --wname | --wshort | \\\n"
+"                  --myname | --mynid | --nameserver | --ns | --node | --persist | \\\n"
 "                  --node-max | --nid-count | --pnid-count | --spares-count  }\n"
 "\n   Where:\n"
 "     -?                Displays usage.\n"
 "     -h                Displays usage.\n\n"
 
+"     -cid              Displays cluster id.\n"
+"     -iid              Displays instance id.\n"
+
 "     -name             Displays all node names in configuration.\n"
 "                        - Name is as stored in configuration, which could be in short host name or FQDN form.\n"
 "     -short            Displays all node names in configuration in short host name form.\n"
@@ -156,6 +166,10 @@
 "                        - Name is as stored in configuration, which could be in short host name or FQDN form.\n"
 "     -wshort           Displays all node names in configuration short host name form prefixed with '-w'.\n\n"
 
+"     -myname           Displays local node name in configuration.\n"
+"                        - Name is as stored in configuration, which could be in short host name or FQDN form.\n"
+"     -mynid            Displays local node-id in configuration.\n\n"
+
 "     -nameserver -ns   Displays nameserver configuration (without begin/end brackets).\n"
 "     -node             Displays node configuration (without begin/end brackets).\n"
 "     -persist          Displays persist configuration (without begin/end brackets).\n\n"
@@ -165,6 +179,12 @@
 "     -pnid-count       Displays count of physical-node-id(s) in the configuration.\n"
 "     -spares-count     Displays count of spare physical-node-id(s) in the configuration.\n\n"
 
+"     --cid             Displays cluster id (prefixed with 'Cluster Id:')\n"
+"     --iid             Displays instance id (prefixed with 'Instance Id:')\n"
+
+"     --myname          Displays local node name in configuration (prefixed with 'Node Name:').\n"
+"     --mynid           Displays local node-id in configuration (prefixed with 'Node Id:').\n\n"
+
 "     --nameserver --ns Displays nameserver configuration (with begin/end brackets).\n"
 "     --node            Displays node configuration (with begin/end brackets).\n"
 "     --persist         Displays persist configuration (with begin/end brackets).\n\n"   
@@ -218,6 +238,42 @@
 
 ///////////////////////////////////////////////////////////////////////////////
 //
+// Function/Method: DisplayId()
+//
+///////////////////////////////////////////////////////////////////////////////
+int DisplayId( void )
+{
+    // Writes the cluster id or the instance id to stdout, as selected by
+    // TrafConfType. When DisplayLabel is set the value is prefixed with its
+    // label and the output ends with a newline; otherwise only the bare
+    // number is printed. Returns 0 on success, -1 for any other type.
+    int status = 0;
+
+    if ( TrafConfType == TrafConfType_ClusterId )
+    {
+        if ( DisplayLabel ) printf( "Cluster Id: " );
+        printf("%d", ClusterConfig.GetClusterId() );
+    }
+    else if ( TrafConfType == TrafConfType_InstanceId )
+    {
+        if ( DisplayLabel ) printf( "Instance Id: " );
+        printf("%d", ClusterConfig.GetInstanceId() );
+    }
+    else
+    {
+        printf( "Invalid configuration type!\n" );
+        status = -1;
+    }
+
+    if ( DisplayLabel )
+    { // Also reached after the error message above when labels are on
+        printf( "\n" );
+    }
+    return(status);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+//
 // Function/Method: DisplayNodeAttributes()
 //
 ///////////////////////////////////////////////////////////////////////////////
@@ -243,7 +299,7 @@
         printf( "node-id=%d;node-name=%s;"
                 "cores=%s;processors=%d;roles=%s\n"
               , lnodeConfig->GetNid()
-              , lnodeConfig->GetName()
+              , lnodeConfig->GetFqdn()
               , coresString
               , lnodeConfig->GetProcessors()
               , RoleTypeString( lnodeConfig->GetZoneType() )
@@ -286,7 +342,7 @@
         {
             printf( "-w " );
         }
-        printf( "%s ", NodeNameStr(lnodeConfig->GetName()) );
+        printf( "%s ", NodeNameStr(lnodeConfig->GetFqdn()) );
     }
 }
 
@@ -407,7 +463,7 @@
 ///////////////////////////////////////////////////////////////////////////////
 int DisplayConfigCounts( void )
 {
-    int rc   = -1;
+    int rc   = 0;
 
     switch (TrafConfType)
     {
@@ -441,6 +497,70 @@
             break;
         default:
             printf( "Invalid configuration type!\n" );
+            rc = -1;
+    }
+    if ( DisplayLabel )
+    {
+        printf( "\n" );
+    }
+    return(rc);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+//
+// Function/Method: DisplayMyNode()
+//
+///////////////////////////////////////////////////////////////////////////////
+int DisplayMyNode( void )
+{
+    int rc   = 0;
+    char name[TC_PROCESSOR_NAME_MAX]; // hostname
+    CPNodeConfig * pnodeConfig = NULL;
+    CLNodeConfig * lnodeConfig = NULL;
+
+    if ( gethostname(name, TC_PROCESSOR_NAME_MAX) != 0 ) name[0] = '\0'; // on failure use an empty name, never an uninitialized buffer
+    name[TC_PROCESSOR_NAME_MAX - 1] = '\0'; // POSIX does not guarantee a terminator when the name is truncated
+    char *tmpptr = name;
+    for ( ; *tmpptr; tmpptr++ ) // lowercase the host name in place before it is used for the lookup
+    {
+        *tmpptr = (char)tolower( (unsigned char)*tmpptr ); // tolower() is only defined for unsigned char values and EOF
+    }
+
+    pnodeConfig = ClusterConfig.GetPNodeConfig( name );
+    if (pnodeConfig == NULL)
+    {
+        printf( "Local host %s is not in Trafodion Configuration!\n"
+              , name );
+        return(-1);
+    }
+
+    lnodeConfig = pnodeConfig->GetFirstLNodeConfig();
+    if (lnodeConfig == NULL)
+    {
+        printf( "Logical node for local host %s is not in Trafodion Configuration!\n"
+              , name );
+        return(-1);
+    }
+
+    switch (TrafConfType)
+    {
+        case TrafConfType_MyNodeName:
+            if ( DisplayLabel )
+            {
+                printf( "Node Name: " );
+            }
+            printf("%s", pnodeConfig->GetFqdn() );
+            break;
+        case TrafConfType_MyNodeId:
+            if ( DisplayLabel )
+            {
+                printf( "Node Id: " );
+            }
+            printf("%d", lnodeConfig->GetNid() );
+            break;
+        default:
+            printf( "Invalid configuration type!\n" );
+            rc = -1;
     }
     if ( DisplayLabel )
     {
@@ -635,6 +755,10 @@
 
     switch (TrafConfType)
     {
+        case TrafConfType_ClusterId:
+        case TrafConfType_InstanceId:
+            rc = DisplayId();
+            break;
         case TrafConfType_NodeConfig:
         case TrafConfType_NodeName:
         case TrafConfType_NodeName_w:
@@ -654,6 +778,10 @@
         case TrafConfType_NameServerConfig:
             rc = DisplayNameServerConfig( );
             break;
+        case TrafConfType_MyNodeName:
+        case TrafConfType_MyNodeId:
+            rc = DisplayMyNode( );
+            break;
         case TrafConfType_NodeId:
         case TrafConfType_PhysicalNodeId:
         case TrafConfType_ZoneId:
@@ -692,6 +820,14 @@
             DisplayUsage();
             return 0;
         }
+        else if ( strcasecmp( argv [argx], "-cid" ) == 0 )
+        {
+            TrafConfType = TrafConfType_ClusterId;
+        }
+        else if ( strcasecmp( argv [argx], "-iid" ) == 0 )
+        {
+            TrafConfType = TrafConfType_InstanceId;
+        }
         else if ( strcasecmp( argv [argx], "-name" ) == 0 )
         {
             TrafConfType = TrafConfType_NodeName;
@@ -710,6 +846,14 @@
             DisplayShortHost = true;
             TrafConfType = TrafConfType_NodeName_w;
         }
+        else if ( strcasecmp( argv [argx], "-myname" ) == 0 )
+        {
+            TrafConfType = TrafConfType_MyNodeName;
+        }
+        else if ( strcasecmp( argv [argx], "-mynid" ) == 0 )
+        {
+            TrafConfType = TrafConfType_MyNodeId;
+        }
         else if ( ( strcasecmp( argv [argx], "-nameserver" ) == 0 ) ||
                   ( strcasecmp( argv [argx], "-ns" ) == 0 ) )
         {
@@ -745,6 +889,26 @@
         {
             TrafConfType = TrafConfType_PersistConfig;
         }
+        else if ( strcasecmp( argv [argx], "--cid" ) == 0 )
+        {
+            TrafConfType = TrafConfType_ClusterId;
+            DisplayLabel = true;
+        }
+        else if ( strcasecmp( argv [argx], "--iid" ) == 0 )
+        {
+            TrafConfType = TrafConfType_InstanceId;
+            DisplayLabel = true;
+        }
+        else if ( strcasecmp( argv [argx], "--myname" ) == 0 )
+        {
+            TrafConfType = TrafConfType_MyNodeName;
+            DisplayLabel = true;
+        }
+        else if ( strcasecmp( argv [argx], "--mynid" ) == 0 )
+        {
+            TrafConfType = TrafConfType_MyNodeId;
+            DisplayLabel = true;
+        }
         else if ( strcasecmp( argv [argx], "--node" ) == 0 )
         {
             DisplayBeginEnd = true;
diff --git a/core/sqf/src/trafconf/trafconfig.cpp b/core/sqf/src/trafconf/trafconfig.cpp
index 3f0e549..1b0bab9 100644
--- a/core/sqf/src/trafconf/trafconfig.cpp
+++ b/core/sqf/src/trafconf/trafconfig.cpp
@@ -25,11 +25,13 @@
 
 using namespace std;
 
+#include <string.h>
 #include "tcdb.h"
 #include "tctrace.h"
 #include "trafconf/trafconfig.h"
 
 bool TcTraceEnabled = false;
+bool TcIsRealCluster = true;
 
 CTrafConfigTrace    TrafConfigTrace;
 CTcdb               TrafConfigDb;
@@ -88,6 +90,11 @@
         return( TCALREADYINIT );
     }
 
+    if ( getenv( "SQ_VIRTUAL_NODES" ) )
+    {
+        TcIsRealCluster = false;
+    }
+
     TcTraceEnabled = traceEnabled;
     if (TcTraceEnabled)
     {
@@ -233,8 +240,20 @@
     }
 
     int rc = TCDBOPERROR;
+    char fqdn_name[TC_PROCESSOR_NAME_MAX];
 
-    rc = TrafConfigDb.AddPNodeData( node_config->node_name
+    if (strlen(node_config->domain_name))   // domain present: store "<node_name>.<domain_name>"
+    {
+        snprintf( fqdn_name, sizeof(fqdn_name), "%s.%s"
+                , node_config->node_name
+                , node_config->domain_name );
+    }
+    else   // no domain: store the node name alone
+    {
+        snprintf( fqdn_name, sizeof(fqdn_name), "%s", node_config->node_name ); // unlike strncpy, always NUL-terminates
+    }
+
+    rc = TrafConfigDb.AddPNodeData( fqdn_name
                                   , node_config->pnid
                                   , node_config->excluded_first_core
                                   , node_config->excluded_last_core );
diff --git a/core/sql/exp/ExpLOBprocess.cpp b/core/sql/exp/ExpLOBprocess.cpp
index 190be7e..7f239f1 100644
--- a/core/sql/exp/ExpLOBprocess.cpp
+++ b/core/sql/exp/ExpLOBprocess.cpp
@@ -345,11 +345,6 @@
                msg->u.shutdown.pid,
                msg->u.shutdown.level);
         break;
-    case MS_MsgType_TmSyncAbort:
-    case MS_MsgType_TmSyncCommit:
-        break;
-    case MS_MsgType_UnsolicitedMessage:
-        break;
     default:
         break;
     }
diff --git a/install/python-installer/configs/db_config_default.ini b/install/python-installer/configs/db_config_default.ini
index 8bd6d2f..157ecd4 100644
--- a/install/python-installer/configs/db_config_default.ini
+++ b/install/python-installer/configs/db_config_default.ini
@@ -56,10 +56,18 @@
 # no need to provide it if the package can be found in current installer's directory
 traf_package =
 
+# trafodion log files location
+# if not provided, it defaults to /var/log/trafodion
+traf_log =
+
+# trafodion temporary working files location
+# if not provided, it defaults to /var/lib/trafodion
+traf_var =
+
 # the number of dcs servers on each node
 dcs_cnt_per_node = 4
 
-# scratch file location, seperated by comma if more than one
+# scratch file location, separated by comma if more than one
 scratch_locs = $TRAF_VAR
 
 # start trafodion instance after installation completed
@@ -123,3 +131,12 @@
 admin_principal =
 # admin password for admin principal, it is used to create trafodion user's principal and keytab
 kdcadmin_pwd =
+
+#######################################
+# For Multi Instance support
+#######################################
+cluster_name = TRAFODION
+traf_instance_name = TRAFODION
+traf_instance_id = 1
+traf_cluster_id = 1
+traf_root_znode = /trafodion
diff --git a/install/python-installer/configs/prompt.json b/install/python-installer/configs/prompt.json
index cab17d1..dcbcc19 100644
--- a/install/python-installer/configs/prompt.json
+++ b/install/python-installer/configs/prompt.json
@@ -226,6 +226,12 @@
       "default":"1",
       "isdigit":true
   },
+  "traf_cluster_id":
+  {
+      "prompt":"Enter a cluster ID for this Trafodion installation",
+      "default":"1",
+      "isdigit":true
+  },
   "use_data_node":
   {
       "prompt":"Install Trafodion nodes on all DataNodes",
diff --git a/install/python-installer/db_install.py b/install/python-installer/db_install.py
index 0057119..6384c76 100755
--- a/install/python-installer/db_install.py
+++ b/install/python-installer/db_install.py
@@ -280,6 +280,13 @@
         cfgs['node_list'] = ','.join(rsnodes)
         cfgs['first_rsnode'] = rsnodes[0] # first regionserver node
 
+        cfgs['traf_cluster_id'] = '1'
+        cfgs['traf_instance_id'] = '1'
+        cfgs['traf_instance_name'] = 'TRAFODION'
+
+    ### set Cluster ID
+#    g('traf_cluster_id')
+
     # check node connection
     for node in cfgs['node_list'].split(','):
         rc = os.system('ping -c 1 %s >/dev/null 2>&1' % node)
diff --git a/install/python-installer/scripts/traf_sqconfig.py b/install/python-installer/scripts/traf_sqconfig.py
index de018a5..b1f212f 100755
--- a/install/python-installer/scripts/traf_sqconfig.py
+++ b/install/python-installer/scripts/traf_sqconfig.py
@@ -39,33 +39,42 @@
     if traf_conf == '': err('TRAF_CONF var is empty')
     sqconfig_file = traf_conf + '/sqconfig'
 
-    core, processor = run_cmd("lscpu|grep -E '(^CPU\(s\)|^Socket\(s\))'|awk '{print $2}'").split('\n')[:2]
-    core = int(core)-1 if int(core) <= 256 else 255
+    traf_var = os.environ['TRAF_VAR']
+    if traf_var == '': err('TRAF_VAR var is empty')
+    sqconfig_db_file = traf_var + '/sqconfig.db'
 
-    lines = ['begin node\n']
-    for node_id, node in enumerate(nodes):
-        line = 'node-id=%s;node-name=%s;cores=0-%d;processors=%s;roles=connection,aggregation,storage\n' % (node_id, node, core, processor)
-        lines.append(line)
+    # If the configuration database file is not yet created,
+    # build the 'sqconfig' file with the nodes specified and compile it.
+    if not os.path.exists(sqconfig_db_file):
+        core, processor = run_cmd("lscpu|grep -E '(^CPU\(s\)|^Socket\(s\))'|awk '{print $2}'").split('\n')[:2]
+        core = int(core)-1 if int(core) <= 256 else 255
 
-    lines.append('end node\n')
-    lines.append('\n')
-    lines.append('begin overflow\n')
+        lines = ['begin node\n']
+        for node_id, node in enumerate(nodes):
+            line = 'node-id=%s;node-name=%s;cores=0-%d;processors=%s;roles=connection,aggregation,storage\n' % (node_id, node, core, processor)
+            lines.append(line)
 
-    for scratch_loc in scratch_locs:
-        line = 'hdd %s\n' % scratch_loc
-        lines.append(line)
+        lines.append('end node\n')
+        lines.append('\n')
+        lines.append('begin overflow\n')
 
-    lines.append('end overflow\n')
+        for scratch_loc in scratch_locs:
+            line = 'hdd %s\n' % scratch_loc
+            lines.append(line)
 
-    # write out the node section
-    with open(sqconfig_file, 'w') as f:
-        f.writelines(lines)
+        lines.append('end overflow\n')
 
-    print 'sqconfig generated successfully!'
+        # write out the node section
+        with open(sqconfig_file, 'w') as f:
+            f.writelines(lines)
 
-    run_cmd('sqgen')
+        print 'sqconfig generated successfully!'
 
-    print 'sqgen ran successfully!'
+        run_cmd('sqgen')
+
+        print 'sqgen ran successfully!'
+    else:  # the configuration database already exists, so sqconfig is not regenerated and sqgen is not run
+        print 'Using existing configuration (%s)' % sqconfig_db_file
 
 # main
 try:
diff --git a/install/python-installer/scripts/traf_user.py b/install/python-installer/scripts/traf_user.py
index ef2cb50..377ef74 100755
--- a/install/python-installer/scripts/traf_user.py
+++ b/install/python-installer/scripts/traf_user.py
@@ -128,6 +128,15 @@
 export dcs_cnt_per_node="%s"
 """ % (dbcfgs['hbase_xml_file'], dbcfgs['hbase_lib_path'], dbcfgs['traf_user'], dbcfgs['traf_version'], dbcfgs['dcs_cnt_per_node'])
 
+    # save additional configs for multi-instance support
+    trafodion_config += """
+export TRAF_CLUSTER_NAME="%s"
+export TRAF_INSTANCE_NAME="%s"
+export TRAF_CLUSTER_ID="%s"
+export TRAF_INSTANCE_ID="%s"
+export TRAF_ROOT_ZNODE="/%s"
+""" % (dbcfgs['cluster_name'], dbcfgs['traf_instance_name'], dbcfgs['traf_cluster_id'], dbcfgs['traf_instance_id'], dbcfgs['traf_user'])
+
     run_cmd('mkdir -p %s' % TRAF_CFG_DIR)
     write_file(TRAF_CFG_FILE, trafodion_config)