///////////////////////////////////////////////////////////////////////////////
//
// @@@ START COPYRIGHT @@@
//
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
//
// @@@ END COPYRIGHT @@@
//
///////////////////////////////////////////////////////////////////////////////

#include <iostream>

using namespace std;

#include <fcntl.h>
#include <semaphore.h>
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <dirent.h>
#include <sys/file.h>
#include <sys/ipc.h>
#include <sys/ptrace.h>
#include <sys/resource.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <errno.h>
#include <sys/epoll.h>

#include "props.h"
#include "localio.h"
#include "mlio.h"
#include "monlogging.h"
#ifdef USE_FORK_SUSPEND_RESUME
#include "monrs.h"
#endif // USE_FORK_SUSPEND_RESUME
#include "monsonar.h"
#include "montrace.h"
#include "redirector.h"
#include "healthcheck.h"
#include "lock.h"
#include "config.h"
#include "device.h"
#include "monitor.h"
#include "msgdef.h"
#include "clusterconf.h"
#include "lnode.h"
#include "pnode.h"
#include "process.h"
#include "intprocess.h"
#include "gentrap.h"
#include "nameserver.h"

#include "reqqueue.h"
extern CReqQueue ReqQueue;

#include "replicate.h"

#ifndef NAMESERVER_PROCESS
#include "ptpclient.h"
#endif

extern bool IsAgentMode;
extern bool IsMaster;

extern bool PidMap;
extern int Measure;
extern int trace_level;
extern int MyPNID;
extern char MyCommPort[MPI_MAX_PORT_NAME];
extern char Node_name[MPI_MAX_PROCESSOR_NAME];
extern sigset_t SigSet;
extern CLock MemModLock;
extern CMonitor *Monitor;
#ifndef NAMESERVER_PROCESS
extern bool NameServerEnabled;
extern CNameServer *NameServer;
extern CPtpClient *PtpClient;
#endif
extern CNodeContainer *Nodes;
extern CConfigContainer *Config;
#ifndef NAMESERVER_PROCESS
extern CDeviceContainer *Devices;
#endif
extern CNode *MyNode;
extern CMonStats *MonStats;
#ifndef NAMESERVER_PROCESS
extern CRedirector Redirector;
#endif
extern CHealthCheck HealthCheck;
extern CReplicate Replicator;
extern CIntProcess IntProcess;

extern char *ErrorMsg (int error_code);
extern _TM_Txid_External invalid_trans( void );
extern _TM_Txid_External null_trans( void );
extern bool isEqual( _TM_Txid_External trans1, _TM_Txid_External trans2 );
extern bool isNull( _TM_Txid_External transid );
extern bool isInvalid( _TM_Txid_External transid );

extern bool IAmIntegrated;
extern bool SMSIntegrating;

extern const char *NodePhaseString( NodePhase phase );
extern const char *ProcessTypeString( PROCESSTYPE type );

extern int monitorArgc;
extern char monitorArgv[MAX_ARGS][MAX_ARG_SIZE];

CProcess::CProcess (CProcess * parent, int nid, int pid,
#ifdef NAMESERVER_PROCESS
                    Verifier_t verifier,
#endif
                    PROCESSTYPE type,
                    int priority, int backup, bool debug, bool unhooked,
                    char *name,
#ifdef NAMESERVER_PROCESS
                    char *path,
                    char *ldpath,
                    char *program,
#else
                    strId_t pathStrId, strId_t ldpathStrId, strId_t programStrId, 
#endif
                    char *infile, char *outfile)
  :
    Nid (nid),
    Pid (pid),
#ifdef NAMESERVER_PROCESS
    Verifier ( verifier ),
#else
    Verifier ( -1 ),
#endif
    PidAtFork_ (pid),
    Type (type),
    Event_messages (false),
    System_messages (false),
    Paired (false),
    Clone (false),
    Debug(debug),
    DeletePending (false),
    StartupCompleted (false),
    Backup (backup),
    Abended (false),
    Attached (false),
    abort_(false),
    Persistent (false),
    UnHooked (unhooked),
    Nowait (false),
    PersistentCreateTime (0),
    PersistentRetries (0),
    Tag ( 0 ),
    Parent (parent),
    PairParentNid (-1),
    PairParentPid (-1),
    PairParentVerifier (-1),
    ReplyTag (REPLY_TAG),           // will be set again when we have a pending reply
    OpenedCount (0),
    LastNid (nid),
    DumpState (Dump_Ready),
    DumpStatus (Dump_Success),
    DumperNid (-1),
    DumperPid (-1),
    DumperVerifier (-1),
    priorPid_ (0),
    State_ (State_Unknown),
    next_(NULL),
    prev_(NULL),
    nextL_(NULL),
    prevL_(NULL),
    unsolTmSyncCount_(0),
    Last_error (MPI_SUCCESS)
    , argc_(0)
    , userArgvLen_ (0)
    , userArgv_ (NULL)
#ifdef NAMESERVER_PROCESS
    , path_(path)
    , ldpath_(ldpath)
    , program_(program)
#else
    , path_()
    , ldpath_()
    , program_()
    , programStrId_(programStrId)
    , pathStrId_(pathStrId)
    , ldpathStrId_(ldpathStrId)
#endif
    , firstInstance_(true)
    , cmpOrEsp_(false)
    , trafConf_()
    , trafHome_()
    , trafVar_()
    , fd_stdin_(-1)
    , fd_stdout_(-1)
    , fd_stderr_(-1)
    , owned_(false)
    , ownerId_(0)
    , replRefCount_(0)
    , requestBuf_ (NULL)
#ifndef NAMESERVER_PROCESS
    , NoticeHead(NULL)
    , NoticeTail(NULL)
#endif
#ifdef NAMESERVER_PROCESS
    , monSockFd_(-1)
    , origPNidNs_(-1)
#endif
{
    char la_buf[MON_STRING_BUF_SIZE];

    const char method_name[] = "CProcess::CProcess";
    TRACE_ENTRY;

    // Add eyecatcher sequence as a debugging aid
    memcpy(&eyecatcher_, "PROC", 4);

    hangupTime_.tv_sec = 0;
    hangupTime_.tv_nsec = 0;

    Port[0] = '\0';
    STRCPY (Name, name);
    CreationTime.tv_sec = 0;
    CreationTime.tv_nsec = 0;
    if ( infile && strcmp(infile,"#default") != 0)
        infile_ = infile;
    if ( outfile && strcmp(outfile,"#default") != 0)
        outfile_ = outfile;

#ifndef NAMESERVER_PROCESS
    Config->strIdToString(programStrId_, program_ );
#endif

    switch (Type)
    {
        case ProcessType_ASE:
        case ProcessType_TSE:
            Priority = (priority<TSE_BASE_NICE?TSE_BASE_NICE:priority);
            break;
        case ProcessType_DTM:
            Priority = (priority<DTM_BASE_NICE?DTM_BASE_NICE:priority);
            break;
        case ProcessType_NameServer:
        case ProcessType_Watchdog:
        case ProcessType_PSD:
            Priority = priority;
            break;
        case ProcessType_AMP:
        case ProcessType_Backout:
        case ProcessType_VolumeRecovery:
        case ProcessType_MXOSRVR:
        case ProcessType_PERSIST:
        case ProcessType_SMS:
        case ProcessType_SPX:
        case ProcessType_SSMP:
        case ProcessType_TMID:
        case ProcessType_Generic:
            Priority = (priority<APP_BASE_NICE?APP_BASE_NICE:priority);
            break;
        default:
            Priority = priority;
            snprintf(la_buf, sizeof(la_buf),
                     "[CProcess::CProcess], Invalid process type!\n");
            mon_log_write(MON_PROCESS_PROCESS_1, SQ_LOG_ERR, la_buf);
    }

    switch (Type)
    {
        case ProcessType_DTM:
        case ProcessType_PSD:
        case ProcessType_PERSIST:
        case ProcessType_SMS:
        case ProcessType_SPX:
        case ProcessType_SSMP:
        case ProcessType_TMID:
        case ProcessType_Watchdog:
        case ProcessType_NameServer:
            Persistent = true;
            break;
        default:
            break;
    }

    if (parent)
    {
        // the process is being started at the request of a parent process
        Parent_Nid = parent->Nid;
        Parent_Pid = parent->Pid;
        Parent_Verifier = parent->Verifier;
        if (trace_settings & (TRACE_PROCESS_DETAIL | TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL))
           trace_printf("%s@%d - Process (%d, %d) has parent (%d, %d)\n", method_name, __LINE__, Nid, Pid, Parent_Nid, Parent_Pid);
        if (Backup)
        {
            PairParentNid = parent->PairParentNid;
            PairParentPid = parent->PairParentPid;
            parent->Parent_Nid = Nid;
            parent->Parent_Pid = Pid;
            if (trace_settings & (TRACE_PROCESS_DETAIL | TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL))
                trace_printf("%s@%d - Backup process %s (%d, %d) has process "
                             "pair parent (%d, %d) and primary process "
                             "(%d, %d)\n",
                             method_name, __LINE__, Name, Nid, Pid,
                             PairParentNid, PairParentPid,
                             parent->Nid, parent->Pid);
        }
    }
    else
    {
        // the process is being started by the monitor at initiation time
        Parent_Nid = -1;
        Parent_Pid = -1;
        Parent_Verifier = -1;
        if (backup)
        {
            snprintf(la_buf, sizeof(la_buf),
                     "[CProcess::CProcess], No Primary for Backup process!\n");
            mon_log_write(MON_PROCESS_PROCESS_2, SQ_LOG_ERR, la_buf);
        }
    }
    if (trace_settings & (TRACE_PROCESS_DETAIL | TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL))
       trace_printf("%s@%d" " - Process " "%s (nid=%d, priority=%d)" " created @ " "%p""\n", method_name, __LINE__, Name, Nid, Priority, this);

    Monitor->IncProcessCount();

    // Record statistics (sonar counters)
    if (sonar_verify_state(SONAR_ENABLED | SONAR_MONITOR_ENABLED))
       MonStats->NumProcsIncr();

    TRACE_EXIT;
}

CProcess::~CProcess (void)
{
    const char method_name[] = "CProcess::~CProcess";
    TRACE_ENTRY;
    Monitor->DecrProcessCount();

    if (trace_settings & (TRACE_PROCESS_DETAIL | TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL))
       trace_printf("%s@%d" " - Process " "%s(%d,%d:%d)" " destroyed @ " "%p""\n", method_name, __LINE__, Name, Nid, Pid, Verifier, this);

    // Record statistics (sonar counters)
    if (sonar_verify_state(SONAR_ENABLED | SONAR_MONITOR_ENABLED))
       MonStats->NumProcsDecr();


#ifndef NAMESERVER_PROCESS
    deathInterestLock_.lock();
    CNotice *notice = NoticeHead;
    while (notice)
    {
        // Send death notice messages to all opened processes
        notice->DeLink (&NoticeHead, &NoticeTail);
        delete notice;

        notice = NoticeHead;
    }
    deathInterestLock_.unlock();
#endif

    // For SSM process, release any undelivered pending notices.
    struct message_def * deathNotice;
    while ((deathNotice = GetDeathNotice()) != NULL)
    {
        delete deathNotice;
    }

    delete [] userArgv_;

#ifndef NAMESERVER_PROCESS
    if (fd_stdin_ != -1 && !Clone)
    {
        Redirector.tryShutdownPipeFd(Pid, fd_stdin_, false);
    }

    if (fd_stdout_ != -1)
    {
        Redirector.tryShutdownPipeFd(Pid, fd_stdout_, true);
    }

    if (fd_stderr_ != -1)
    {
        Redirector.tryShutdownPipeFd(Pid, fd_stderr_, false);
    }
#endif

    // Remove the fifos associated with this process (if any)
    if (fifo_stdin_.size() != 0)
    {
        unlink(fifo_stdin_.c_str());
    }

    if (fifo_stdout_.size() != 0)
    {
        unlink(fifo_stdout_.c_str());
    }

    if (fifo_stderr_.size() != 0)
    {
        unlink(fifo_stderr_.c_str());
    }

    // Alter eyecatcher sequence as a debugging aid to identify deleted object
    memcpy(&eyecatcher_, "proc", 4);

    TRACE_EXIT;
}

#ifndef NAMESERVER_PROCESS
const char* CProcess::path()
{
    Config->strIdToString(pathStrId_, path_ );
    return( path_.c_str() );
}
#endif

#ifndef NAMESERVER_PROCESS
const char* CProcess::ldpath()
{
    Config->strIdToString(ldpathStrId_, ldpath_ );
    return( ldpath_.c_str() );
}
#endif

int CProcess::getUserArgs( char user_argv[MAX_ARGS][MAX_ARG_SIZE] )
{
    const char *pUserArgv = userArgv_;
    int i, arglen;
    for (i = 0; i < argc_; i++)
    {
        arglen = strlen (pUserArgv) + 1;
        strcpy( user_argv[i], pUserArgv );
        pUserArgv += arglen;
    }
    strcpy( user_argv[i], "" );
    return(argc_);
}

void CProcess::userArgs ( int argc, int argvLen, const char * argvList )
{
    const char method_name[] = "CProcess::userArgs";
    TRACE_ENTRY;

    argc_ = argc;
    userArgvLen_ = argvLen;
    if ( userArgv_ != NULL )
    {
        delete[] userArgv_;
    }
    userArgv_ = new char[ argvLen ];
    memcpy(userArgv_, argvList, argvLen);

    TRACE_EXIT;
}

void CProcess::userArgs ( int argc, char user_argv[MAX_ARGS][MAX_ARG_SIZE] )
{
    const char method_name[] = "CProcess::userArgs";
    TRACE_ENTRY;

    argc_ = argc;

    // Compute amount of space need to store argument strings
    userArgvLen_ = 0;
    for (int i = 0; i < argc; i++)
    {
        userArgvLen_ += strlen(user_argv[i]) + 1;
    }
    if (trace_settings & (TRACE_PROCESS_DETAIL | TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL))
        trace_printf("%s@%d - Copying arguments argc=%d, argvSize=%d\n",
                     method_name, __LINE__, argc, userArgvLen_);
    if (userArgvLen_ != 0)
    {
        userArgv_ = new char[userArgvLen_];
    }

    char *pUserArgv = userArgv_;
    for (int i = 0; i < argc; i++)
    {
        if (trace_settings & (TRACE_PROCESS_DETAIL | TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL))
            trace_printf("%s@%d - name=%s, Copying user_argv[%d]='%s'\n", method_name, __LINE__, Name, i, user_argv[i]);
        strcpy (pUserArgv, user_argv[i]);
        pUserArgv += strlen(user_argv[i]) + 1;
    }

    TRACE_EXIT;
}

void CProcess::validateObj( void )
{
    if (strncmp((const char *)&eyecatcher_, "PROC", 4) !=0 )
    {  // Not a valid object
        abort();
    }
}

#ifndef NAMESERVER_PROCESS
bool CProcess::CancelDeathNotification( int nid
                                      , int pid
                                      , Verifier_t verifier
                                      , _TM_Txid_External trans_id )
{
    bool status = FAILURE;
    CNotice *next;

    const char method_name[] = "CProcess::CancelDeathNotification";
    TRACE_ENTRY;

    deathInterestLock_.lock();
    CNotice *notice = NoticeHead;

    while( notice )
    {
        if ((( notice->Nid == nid    ) &&
             ( notice->Pid == pid    ) &&
             ( notice->verifier_ == verifier ) &&
             ( isInvalid( trans_id ) || isEqual( notice->TransID, trans_id )))
         || (( nid == -1 || pid == -1             ) &&
             ( isEqual(notice->TransID, trans_id) ) ) )
        {
            next = notice->GetNext();

            if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL
                                  | TRACE_PROCESS_DETAIL))
            {
                trace_printf( "%s@%d - Process %s (%d, %d:%d) deleting death "
                              "notice interest for %s (%d, %d:%d), "
                              "trans_id=%lld.%lld.%lld.%lld\n"
                            , method_name, __LINE__
                            , Name
                            , Nid
                            , Pid
                            , Verifier
                            , notice->name_.c_str()
                            , notice->Nid
                            , notice->Pid
                            , notice->verifier_
                            , notice->TransID.txid[0]
                            , notice->TransID.txid[1]
                            , notice->TransID.txid[2]
                            , notice->TransID.txid[3] );
            }

            notice->DeLink(&NoticeHead, &NoticeTail);
            delete notice;

            notice = next;

            status = SUCCESS;
        }
        else
        {
            notice = notice->GetNext();
        }
    }

    deathInterestLock_.unlock();

    TRACE_EXIT;
    return status;
}
#endif

#ifndef NAMESERVER_PROCESS
// Death notice registration for a process
bool CProcess::procExitReg(CProcess *targetProcess,
                           _TM_Txid_External transId)
{
    const char method_name[] = "CProcess::ProcExitReg";
    TRACE_ENTRY;

    bool status = FAILURE;

    if ( Nid != targetProcess->GetParentNid() ||
         Pid != targetProcess->GetParentPid())
    {   // This process is not the parent of the target process (parent
        // processes automatically get process death notifications.)

        nidPid_t target = { targetProcess->Nid, targetProcess->Pid };
        deathInterestLock_.lock();
        // Add entry to list of processes that are being monitored
        // by this process.
        deathInterest_.push_back( target );
        // Add entry to set of nids of processes that are being monitored
        // by this process.
        deathInterestNid_.insert( targetProcess->Nid );
        deathInterestLock_.unlock();

        // Register interest with the target process
        targetProcess->RegisterDeathNotification( Nid
                                                , Pid
                                                , Verifier
                                                , Name
                                                , transId );
        status = SUCCESS;

        if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
        {
            trace_printf("%s@%d - Process %s (%d, %d) registered interest "
                         "in death of process %s (%d, %d), "
                         "trans_id=%lld.%lld.%lld.%lld\n",
                         method_name, __LINE__, Name, Nid, Pid,
                         targetProcess->Name, targetProcess->Nid,
                         targetProcess->Pid,
                         transId.txid[0], transId.txid[1], transId.txid[2],
                         transId.txid[3] );
        }
    }


    TRACE_EXIT;
    return status;
}
#endif

#ifndef NAMESERVER_PROCESS
void CProcess::procExitNotifierNodes( void )
{
    const char method_name[] = "CProcess::procExitNotifierNodes";
    TRACE_ENTRY;

    CLNode *targetLNode = NULL;
    CNode  *targetNode = NULL;
    nidSet_t::iterator it;

    // Remove death notice registration for all entries on list
    deathInterestLock_.lock();
    for ( it = deathInterestNid_.begin(); it != deathInterestNid_.end(); ++it)
    {
        targetLNode = Nodes->GetLNode ( *it );
        if (targetLNode)
        {
            targetNode = targetLNode->GetNode();
        }

        if ( targetNode )
        {
            if (NameServerEnabled && targetNode->GetPNid() != MyPNID)
            {
                // Forward the process exit to the target node
                int rc = PtpClient->ProcessExit( this 
                                               , targetLNode->GetNid()
                                               , targetNode->GetName() ); 
                if (rc)
                {
                    char la_buf[MON_STRING_BUF_SIZE];
                    snprintf( la_buf, sizeof(la_buf)
                            , "[%s] - Can't send process exit "
                              "for process %s (%d, %d) "
                              "to target node %s, nid=%d\n"
                            , method_name
                            , GetName()
                            , GetNid()
                            , GetPid()
                            , targetLNode->GetNode()->GetName()
                            , targetLNode->GetNid() );
                    mon_log_write(MON_PROCESS_PROCEXITNOTIFIERNODES_1, SQ_LOG_ERR, la_buf);
                }
            }
        }
    }
    deathInterestNid_.clear();
    deathInterestLock_.unlock();

    TRACE_EXIT;
}
#endif

#ifndef NAMESERVER_PROCESS
void CProcess::procExitUnregAll ( _TM_Txid_External transId )
{
    const char method_name[] = "CProcess::procExitUnregAll";
    TRACE_ENTRY;

    CLNode *node;
    CProcess *targetProcess = NULL;
    nidPidList_t::iterator it;

    // Remove death notice registration for all entries on list
    deathInterestLock_.lock();
    for ( it = deathInterest_.begin(); it != deathInterest_.end(); ++it)
    {
        node = Nodes->GetLNode ( it->nid );
        targetProcess = NULL;
        if (node)
        {
            targetProcess = node->GetProcessL( it->pid );
        }

        if ( targetProcess )
        {
            if (NameServerEnabled && targetProcess->IsClone())
            {
                CLNode *targetLNode = Nodes->GetLNode( targetProcess->GetNid() );
            
                int rc = -1;
                // Forward the process cancel death notification to the target node
                rc = PtpClient->ProcessNotify( targetProcess->GetNid()
                                             , targetProcess->GetPid()
                                             , targetProcess->GetVerifier()
                                             , transId
                                             , true // cancel target's death notification
                                             , this // of this process
                                             , targetLNode->GetNid()
                                             , targetLNode->GetNode()->GetName() ); 
                if (rc)
                {
                    char la_buf[MON_STRING_BUF_SIZE];
                    snprintf( la_buf, sizeof(la_buf)
                            , "[%s] - Can't send process notify request "
                              "for process %s (%d, %d) "
                              "to target node %s, nid=%d\n"
                            , method_name
                            , targetProcess->GetName()
                            , targetProcess->GetNid()
                            , targetProcess->GetPid()
                            , targetLNode->GetNode()->GetName()
                            , targetLNode->GetNid() );
                    mon_log_write(MON_PROCESS_PROCEXITUNREGALL_1, SQ_LOG_ERR, la_buf);
                }
            }
            
            targetProcess->CancelDeathNotification( Nid
                                                  , Pid
                                                  , Verifier
                                                  , transId );
        }
    }
    deathInterest_.clear();
    deathInterestLock_.unlock();

    TRACE_EXIT;
}
#endif

#ifndef NAMESERVER_PROCESS
void CProcess::childAdd ( int nid, int pid )
{
    const char method_name[] = "CProcess::childAdd";
    TRACE_ENTRY;

    nidPid_t child = { nid, pid };
    childrenListLock_.lock();
    children_.push_back ( child );
    childrenListLock_.unlock();

    TRACE_EXIT;
}

int CProcess::childCount ( void )
{
    const char method_name[] = "CProcess::childCount";
    TRACE_ENTRY;

    childrenListLock_.lock();
    int count = children_.size();
    childrenListLock_.unlock();

    TRACE_EXIT;
    return(count);
}

void CProcess::childRemove ( int nid, int pid )
{
    const char method_name[] = "CProcess::childRemove";
    TRACE_ENTRY;

    nidPidList_t::iterator it;

    childrenListLock_.lock();
    for ( it = children_.begin(); it != children_.end(); ++it)
    {
        if (it->nid == nid && it->pid == pid )
        {
            children_.erase ( it );
            break;
        }
    }
    childrenListLock_.unlock();

    TRACE_EXIT;
}

bool CProcess::childRemoveFirst ( nidPid_t & child)
{
    const char method_name[] = "CProcess::childRemoveFirst";
    TRACE_ENTRY;

    bool result = false;

    childrenListLock_.lock();
    if ( !children_.empty() )
    {
        child = children_.front ();
        children_.pop_front ();
        result = true;

    }
    childrenListLock_.unlock();

    TRACE_EXIT;

    return result;
}

void CProcess::childUnHookedAdd( int nid, int pid )
{
    const char method_name[] = "CProcess::childUnHookedAdd";
    TRACE_ENTRY;

    if (trace_settings & (TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
        trace_printf( "%s@%d adding unhooked child (%d:%d)\n"
                    , method_name, __LINE__
                    , nid, pid );

    nidPid_t child = { nid, pid };
    childrenListLock_.lock();
    childrenUnHooked_.push_back ( child );
    childrenListLock_.unlock();

    TRACE_EXIT;
}

int CProcess::childUnHookedCount( void )
{
    const char method_name[] = "CProcess::childUnHookedCount";
    TRACE_ENTRY;

    childrenListLock_.lock();
    int count = childrenUnHooked_.size();
    childrenListLock_.unlock();

    TRACE_EXIT;
    return(count);
}

void CProcess::childUnHookedRemove( int nid, int pid )
{
    const char method_name[] = "CProcess::childUnHookedRemove";
    TRACE_ENTRY;

    nidPidList_t::iterator it;

    childrenListLock_.lock();
    for ( it = childrenUnHooked_.begin(); it != childrenUnHooked_.end(); ++it)
    {
        if (it->nid == nid && it->pid == pid )
        {
            childrenUnHooked_.erase ( it );
            break;
        }
    }
    childrenListLock_.unlock();

    TRACE_EXIT;
}

bool CProcess::childUnHookedRemoveFirst( nidPid_t & child)
{
    const char method_name[] = "CProcess::childUnHookedRemoveFirst";
    TRACE_ENTRY;

    bool result = false;

    childrenListLock_.lock();
    if ( !childrenUnHooked_.empty() )
    {
        child = childrenUnHooked_.front ();
        childrenUnHooked_.pop_front ();
        result = true;

    }
    childrenListLock_.unlock();

    TRACE_EXIT;

    return result;
}
#endif

#ifndef NAMESERVER_PROCESS
void CProcess::CompleteDump(DUMPSTATUS status, char *core_file)
{
    CProcess           *dumper;
    struct message_def *msg;

    const char method_name[] = "CProcess::CompleteDump";
    TRACE_ENTRY;

    DumpStatus = status;

    if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
       trace_printf("%s@%d - Dumper Process nid=%d, pid=%d:%d\n",
                    method_name, __LINE__, DumperNid, DumperPid, DumperVerifier);
    dumper = Nodes->GetLNode (DumperNid)->GetProcessL(DumperPid);
    if (dumper && MyNode->IsMyNode(DumperNid))
    {
        if ( (DumperVerifier == -1) || (DumperVerifier == dumper->GetVerifier()) )
        {
            msg = parentContext();
            if ( msg )
            { // reply to parent pending, so send reply
                msg->noreply = false;
                msg->type = MsgType_Service;
                msg->u.reply.type = ReplyType_Dump;
                msg->u.reply.u.dump.nid = Nid;
                msg->u.reply.u.dump.pid = Pid;
                msg->u.reply.u.dump.verifier = Verifier;
                if (status == Dump_Success)
                {
                    STRCPY(msg->u.reply.u.dump.core_file, core_file);
                    msg->u.reply.u.dump.return_code = MPI_SUCCESS;
                }
                else
                {
                    msg->u.reply.u.dump.core_file[0] = 0;
                    msg->u.reply.u.dump.return_code = MPI_ERR_EXITED;
                }
                CRequest::lioreply (msg, dumper->GetPid());
                parentContext( NULL );
            }
        }
    }

    DumpState = Dump_Ready;

    TRACE_EXIT;
}
#endif

#ifndef NAMESERVER_PROCESS
void CProcess::CompleteProcessStartup (char *port, int os_pid, bool event_messages,
                                       bool system_messages, bool preclone,
                                       struct timespec *creation_time, int /*origPNidNs*/)
{
    const char method_name[] = "CProcess::CompleteProcessStartup";
    TRACE_ENTRY;

    STRCPY (Port, port);
    Pid = os_pid;
    Event_messages = event_messages;
    System_messages = system_messages;

    if (preclone)
    {
        Clone = true;
    }

    if (!Clone)
    {
        if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_PROCESS_DETAIL | TRACE_REQUEST_DETAIL))
            trace_printf("%s@%d: process %s (%d, %d), preclone=%d"
                         ", clone=%d\n",
                         method_name, __LINE__, Name,
                         Nid, os_pid, preclone, Clone);
        StartupCompleted = true;
        if (os_pid != -1)
        {
            if ( MyNode->IsMyNode(Nid) )
            {
                if ( NameServerEnabled )
                {
                    int rc = -1;
                    // Register process in Name Server
                    rc = NameServer->ProcessNew(this); // in reqQueue thread (CExtStartupReq)
                    if (rc)
                    {
                        char la_buf[MON_STRING_BUF_SIZE];
                        snprintf( la_buf, sizeof(la_buf)
                                , "[%s] - Can't register new process "
                                  "%s (%d, %d) "
                                  "to Name Server process\n"
                                , method_name
                                , GetName()
                                , GetNid()
                                , GetPid() );
                        mon_log_write(MON_PROCESS_COMPLETESTARTUP_1, SQ_LOG_ERR, la_buf);
                    }

                    if (Parent_Nid != -1)
                    {
                        if (Parent_Nid != Nid)
                        {
                            // Tell the parent node the current state of the process
                            rc = PtpClient->ProcessClone(this);
                            if (rc)
                            {
                                char la_buf[MON_STRING_BUF_SIZE];
                                CLNode *parentLNode = NULL;
                                parentLNode = Nodes->GetLNode( GetParentNid() );
                                snprintf( la_buf, sizeof(la_buf)
                                        , "[%s] - Can't send process clone request"
                                          "for process %s (%d, %d) "
                                          "to parent node %s, nid=%d\n"
                                        , method_name
                                        , GetName()
                                        , GetNid()
                                        , GetPid()
                                        , parentLNode->GetNode()->GetName()
                                        , parentLNode->GetNid() );
                                mon_log_write(MON_PROCESS_COMPLETESTARTUP_2, SQ_LOG_ERR, la_buf);
                            }
                        }
                    }
                }
                else
                {
                    // Replicate the clone to other nodes
                    CReplClone *repl = new CReplClone(this);
                    Replicator.addItem(repl);
                }
            }
            else
            {
                Clone = true;
            }
        }
        else
        {
            // TODO: What does an os_pid == -1 mean?
            if ( NameServerEnabled )
            {
                if (Parent_Nid != -1)
                {
                    if (Parent_Nid != Nid)
                    {
                        int rc = -1;
                        // Tell the parent node the current state of the process
                        rc = PtpClient->ProcessClone(this);
                        if (rc)
                        {
                            char la_buf[MON_STRING_BUF_SIZE];
                            CLNode *parentLNode = NULL;
                            parentLNode = Nodes->GetLNode( GetParentNid() );
                            snprintf( la_buf, sizeof(la_buf)
                                    , "[%s] - Can't send process clone request"
                                      "for process %s (%d, %d) "
                                      "to parent node %s, nid=%d\n"
                                    , method_name
                                    , GetName()
                                    , GetNid()
                                    , GetPid()
                                    , parentLNode->GetNode()->GetName()
                                    , parentLNode->GetNid() );
                            mon_log_write(MON_PROCESS_COMPLETESTARTUP_3, SQ_LOG_ERR, la_buf);
                        }
                    }
                }
            }
            else
            {
                // Replicate the clone to other nodes
                CReplClone *repl = new CReplClone(this);
                Replicator.addItem(repl);
            }
        }
    }

    if (!Clone)
    {
        // check if we need to setup any associated devices.
        if ((Type == ProcessType_TSE) ||
            (Type == ProcessType_ASE)   )
        {
             Devices->CreateDevice( this );
        }

        if ((Type == ProcessType_TSE) ||
            (Type == ProcessType_DTM) ||
            (Type == ProcessType_ASE) )
        {
             MyNode->addToQuiesceSendPids( GetPid(), GetVerifier() );

             if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_PROCESS_DETAIL | TRACE_REQUEST_DETAIL))
                 trace_printf("%s%d: pid %d added to quiesce send list\n", method_name, __LINE__, GetPid());
        }

        if ((Type == ProcessType_TSE) ||
            (Type == ProcessType_ASE) )
        {
             MyNode->addToQuiesceExitPids( GetPid(), GetVerifier() );

             if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_PROCESS_DETAIL | TRACE_REQUEST_DETAIL))
                 trace_printf("%s%d: pid %d added to quiesce exit list\n", method_name, __LINE__, GetPid());
        }
    }

    if ( Clone && !preclone )
    {
        StartupCompleted = true;
        if (creation_time != NULL)
            CreationTime = *creation_time;
    }

    if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_PROCESS_DETAIL | TRACE_REQUEST_DETAIL))
        trace_printf("%s@%d: process %s (%d, %d:%d), preclone=%d"
                     ", clone=%d, StartupCompleted=%d\n",
                     method_name, __LINE__, Name, Nid, os_pid, Verifier, preclone,
                     Clone, StartupCompleted);
    State_ = State_Up;

    // Check if node is shutting down
    if ( !Clone && MyNode->GetState() == State_Shutdown )
    {
        if ( MyNode->GetShutdownLevel() == ShutdownLevel_Abrupt )
        {
            // killing the process will not remove the process object because
            // exit processing will get queued until this completes.
            kill( Pid, SIGKILL );
            if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
                trace_printf( "%s@%d - Shutdown abrupt in process, completed kill for %s (%d, %d)\n"
                            , method_name, __LINE__, Name, Nid, os_pid);
        }
        else
        {
            struct message_def *msg;

            msg = new struct message_def;
            msg->type = MsgType_Shutdown;
            msg->noreply = true;
            msg->u.request.type = ReqType_Notice;
            msg->u.request.u.shutdown.nid = Nid;
            msg->u.request.u.shutdown.pid = -1;
            msg->u.request.u.shutdown.level = MyNode->GetShutdownLevel();
            if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
               trace_printf( "%s@%d" " - Sending shutdown notice, level=%d\n"
                           , method_name, __LINE__, MyNode->GetShutdownLevel() );
            // Send shutdown notice
            SQ_theLocalIOToClient->putOnNoticeQueue( Pid, Verifier, msg, NULL );
        }
    }

    // some special handling for native processes
    if ( !Clone )
    {
        ssmpNoticesLock_.lock();
        if ( Type == ProcessType_SSMP && !ssmpNotices_.empty())
        {   // Some death notices are queued for this SSMP process.  Signal
            // the notifier to get to work on delivering them.
            SQ_theLocalIOToClient->nudgeNotifier ();
        }
        ssmpNoticesLock_.unlock();

        if ( Type == ProcessType_SMS )
        {
            // let healthcheck thread know that the SMService process is up and running.
            HealthCheck.setState(HC_UPDATE_SMSERVICE, (long long)this);
        }
        if ( Type == ProcessType_Watchdog )
        {
            // let healthcheck thread know that the watchdog process is up and running.
            HealthCheck.setState(HC_UPDATE_WATCHDOG, (long long)this);
            // start the watchdog timer
            HealthCheck.setState(MON_START_WATCHDOG);
        }
        if ( Type == ProcessType_PSD &&
            (IAmIntegrated || MyNode->IsActivatingSpare() || MyNode->IsSoftNodeDown()) )
        {
             MyNode->StartPStartDPersistent();

             if (trace_settings & (TRACE_RECOVERY | TRACE_REQUEST | TRACE_INIT))
                 trace_printf("%s%d: Sent start persistent processes event to PSD process %s (pid=%d)\n", method_name, __LINE__, GetName(), GetPid());
        }
        if ( Type == ProcessType_DTM  &&
             MyNode->IsSoftNodeDown() )
        {
            // Tell remote DTMs that this DTM was restarted
            Monitor->SoftNodeUpPrepare( MyPNID );
        }
    }

    TRACE_EXIT;
}
#endif

void CProcess::CompleteRequest( int status )
{
#ifndef NAMESERVER_PROCESS
    struct message_def *msg;
#endif

    const char method_name[] = "CProcess::CompleteRequest";
    TRACE_ENTRY;

    if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
       trace_printf("%s@%d - Process %s (%d,%d:%d), status %d\n",
                    method_name, __LINE__, Name, Nid, Pid, Verifier, status);

#ifndef NAMESERVER_PROCESS
    if ( !Clone )
    {
        msg = parentContext();
        if ( msg )
        { // reply pending, so send reply
            msg->noreply = false;
            msg->u.reply.type = ReplyType_Generic;
            msg->u.reply.u.generic.nid = Nid;
            msg->u.reply.u.generic.pid = Pid;
            msg->u.reply.u.generic.verifier = Verifier;
            msg->u.reply.u.generic.process_name[0] = '\0';
            msg->u.reply.u.generic.return_code = status;

            CRequest::lioreply (msg, Pid);
            parentContext( NULL );
        }
    }
#endif

    TRACE_EXIT;
}

bool CProcess::PickStdfile(PickStdFile_t whichStdfile,
                           char (&Destfile)[MAX_PROCESS_PATH],
                           int &ancestorNid, int &ancestorPid)
{
    const char method_name[] = "CProcess::PickStdfile";
    TRACE_ENTRY;

    CLNode *node = NULL;
    CProcess *ancestor;
    int nextNid = -1;
    int nextPid = 0;

    if (whichStdfile == PICK_STDOUT)
    {
        if (!outfile_.empty())
        {
            STRCPY(Destfile, outfile_.c_str());
            TRACE_EXIT;
            return true;
        }
    }
    else
    {
        if (!infile_.empty())
        {
            STRCPY(Destfile, infile_.c_str());
            TRACE_EXIT;
            return true;
        }
    }

    nextNid = Parent_Nid;
    nextPid = Parent_Pid;
    Destfile[0] = '\0';
    bool retVal = true;

    // Keep track of process creation times to avoid looping forever.
    struct timespec earlyCreationTime;
    earlyCreationTime.tv_sec = CreationTime.tv_sec;
    earlyCreationTime.tv_nsec = CreationTime.tv_nsec;

    while(true)
    {
        node = Nodes->GetLNode (nextNid);
        if (node)
        {
            ancestor = node->GetProcessL(nextPid);
            if ( ancestor  &&
                 (( ! MyNode->IsMyNode(ancestor->GetNid())) ||
                  (ancestor->CreationTime.tv_sec  < earlyCreationTime.tv_sec ||
                   (ancestor->CreationTime.tv_sec == earlyCreationTime.tv_sec  &&
                    ancestor->CreationTime.tv_nsec < earlyCreationTime.tv_nsec))) )
            {
                earlyCreationTime.tv_sec  = ancestor->CreationTime.tv_sec;
                earlyCreationTime.tv_nsec = ancestor->CreationTime.tv_nsec;

                if (whichStdfile == PICK_STDOUT && (ancestor->outfile())[0])
                {
                    // The ancestor specified a standard outfile
                    if ( MyNode->IsMyNode(nextNid) )
                    {   // The ancestor and this process are on the same node
                        STRCPY(Destfile, ancestor->outfile());
                    }
                    else
                    {   // The ancestor is on a different node.
                        ancestorNid = nextNid;
                        ancestorPid = nextPid;
                    }

                    break;
                }
                else if (whichStdfile == PICK_STDIN && (ancestor->infile())[0])
                {
                    // The ancestor specified a standard outfile
                    if ( MyNode->IsMyNode(nextNid) )
                    {   // The ancestor and this process are on the same node
                        STRCPY(Destfile, ancestor->infile());
                    }
                    else
                    {   // The ancestor is on a different node.
                        ancestorNid = nextNid;
                        ancestorPid = nextPid;
                    }

                    break;
                }
                else
                {   // The ancestor process did not specify a stdout file
                    // so next examine ancestor's parent.
                    if (Backup || ancestor->Backup)
                    {
                        nextNid = ancestor->PairParentNid;
                        nextPid = ancestor->PairParentPid;
                    }
                    else
                    {
                        nextNid = ancestor->Parent_Nid;
                        nextPid = ancestor->Parent_Pid;
                    }
                }
            }
            else
            {
                if (trace_settings & (TRACE_PROCESS | TRACE_REDIRECTION))
                    trace_printf("%s@%d could not find process object for "
                                 "pid=%d\n",
                                 method_name, __LINE__, nextPid);
                retVal = false;
                break;
            }
        }
        else
        {  // Unexpectedly could not find node object
           // log error
            if (trace_settings & (TRACE_PROCESS | TRACE_REDIRECTION))
                trace_printf("%s@%d could not find node object for nid=%d\n",
                             method_name, __LINE__, nextNid);

            if (nextNid != -1)
            {
                char buf[MON_STRING_BUF_SIZE];
                snprintf(buf, sizeof(buf),
                         "%s, Unable to find node object for nid=%d\n",
                        method_name, nextNid);
                mon_log_write(MON_PROCESS_PICKSTDFILE_2, SQ_LOG_ERR, buf);
            }
            retVal = false;
            break;
        }
    }

    TRACE_EXIT;

    return retVal;
}

// for attached processes,
//   set CreationTime to last modification time of /proc/<pid>/cmdline
// for unattached process,
//   set CreationTime to current time (fork time)
void CProcess::SetCreationTime(int os_pid)
{
    if (os_pid == -1)
    {
        struct timespec ts;
        int err = clock_gettime(CLOCK_REALTIME, &ts);
        if (err == 0)
            CreationTime = ts;
    } else
    {
        char statline[40];
        struct stat statbuf;
        snprintf(statline, sizeof(statline), "/proc/%d/cmdline", os_pid);
        int err = stat(statline, &statbuf);
        if (err == 0)
            CreationTime = statbuf.st_mtim;
    }
}

void CProcess::SetVerifier()
{
   Verifier = Monitor->incrGetVerifierNum();
   return;
}

#ifndef NAMESERVER_PROCESS
void CProcess::SetupFifo(int attachee_nid, int attachee_pid)
{
    const char method_name[] = "CProcess::SetupFifo";
    TRACE_ENTRY;

    // reset umask (group needs write permissions for fifo)
    mode_t prev_mask;
    prev_mask = umask(S_IWOTH);


    // Get the file name for the attached process's current standard in file
    char std_name[MAX_PROCESS_PATH];
    char filepath[30];
    ssize_t std_name_len;
    snprintf (filepath, sizeof(filepath), "/proc/%d/fd/0", attachee_pid);
    std_name_len = readlink (filepath, std_name, MAX_PROCESS_PATH-1);
    if (std_name_len < 0) std_name_len = 0;
    std_name[std_name_len] = '\0';

    if ((std_name_len >= 9)
     && (strcmp(&std_name[std_name_len-9], "(deleted)") != 0))
    {
        // Record the infile name in the process object
        infile_ = std_name;
    }
    else if (trace_settings & (TRACE_PROCESS | TRACE_REDIRECTION))
        trace_printf("%s@%d Not saving stdin file %s for pid=%d\n",
                     method_name, __LINE__, std_name, attachee_pid);

    // Get the file name for the attached process's current standard out file
    snprintf (filepath, sizeof(filepath), "/proc/%d/fd/1", attachee_pid);
    std_name_len = readlink (filepath, std_name, MAX_PROCESS_PATH-1);
    if (std_name_len < 0) std_name_len = 0;
    std_name[std_name_len] = '\0';

    // Record the outfile name in the process object.  Any child
    // process created by it may write to the pipe.
    if (strncmp(std_name, "pipe:", 5) != 0)
    {   // The attach process has a device or file for its standard output.
        outfile_ = std_name;
    }
    else
    {   // The attached process has a pipe for its standard output.
        outfile_ = filepath;
    }

    // Create unique fifo name, store in process object
    bool fifo_ok = true;
    char fifo_stdout[50];
    strcpy(fifo_stdout, "/tmp/sqmp.XXXXXX");
    int fifo_stdout_fd = mkstemp(fifo_stdout);
    if (fifo_stdout_fd == -1)
    {   // Unexpected mkstemp problem
        char buf[MON_STRING_BUF_SIZE];
        snprintf(buf, sizeof(buf), "[%s], mkstemp(%s) error, %s.\n", method_name,
                fifo_stdout, strerror(errno));
        mon_log_write(MON_PROCESS_SETUPFIFO_1, SQ_LOG_ERR, buf);
        fifo_ok = false;
    }
    if (fifo_ok)
    {
        fifo_stdout_ = fifo_stdout;
        // unlink so mkfifo works
        int err = unlink(fifo_stdout);
        if (err == -1)
        {   // Unexpected unlink problem
            char buf[MON_STRING_BUF_SIZE];
            snprintf(buf, sizeof(buf), "[%s], unlink(%s) error, %s.\n", method_name,
                    fifo_stdout, strerror(errno));
            mon_log_write(MON_PROCESS_SETUPFIFO_2, SQ_LOG_ERR, buf);
            fifo_ok = false;
        }
    }
    if (fifo_ok)
    {
        if (mkfifo(fifo_stdout, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP))
        {   // Unexpected fifo creation problem
            char buf[MON_STRING_BUF_SIZE];
            snprintf(buf, sizeof(buf), "[%s], mkfifo(%s) error, %s.\n", method_name,
                    fifo_stdout, strerror(errno));
            mon_log_write(MON_PROCESS_SETUPFIFO_3, SQ_LOG_ERR, buf);
        }
        else
        {
            // Open the fifo for reading.  Use non-blocking mode because
            // otherwise open would not complete until attached process
            // opens fifo for writing.
            fd_stdout_ = open (fifo_stdout, O_RDONLY | O_NONBLOCK);
            if (fd_stdout_ == -1)
            {
                char buf[MON_STRING_BUF_SIZE];
                snprintf(buf, sizeof(buf),
                         "[%s], fifo open(%s) error, %s.\n", method_name,
                        fifo_stdout, strerror(errno));
                mon_log_write(MON_PROCESS_SETUPFIFO_4, SQ_LOG_ERR, buf);
            }
            else
            {
                // close the unlinked file
                close(fifo_stdout_fd);
            }
        }

#ifndef NAMESERVER_PROCESS
        Redirector.stdoutFd(attachee_nid, attachee_pid, fd_stdout_, outfile_.c_str(),
                            -1, -1);
#endif
    }

    // Create unique stderr fifo name, store in process object
    fifo_ok = true;
    char fifo_stderr[50];
    strcpy(fifo_stderr, "/tmp/sqmp.XXXXXX");
    int fifo_stderr_fd = mkstemp(fifo_stderr);
    if (fifo_stderr_fd == -1)
    {   // Unexpected mkstemp problem
        char buf[MON_STRING_BUF_SIZE];
        snprintf(buf, sizeof(buf), "[%s], mkstemp(%s) error, %s.\n", method_name,
                fifo_stderr, strerror(errno));
        mon_log_write(MON_PROCESS_SETUPFIFO_5, SQ_LOG_ERR, buf);
        fifo_ok = false;
    }

    if (fifo_ok)
    {
        fifo_stderr_ = fifo_stderr;
        // unlink so mkfifo works
        int err = unlink(fifo_stderr);
        if (err == -1)
        {   // Unexpected unlink problem
            char buf[MON_STRING_BUF_SIZE];
            snprintf(buf, sizeof(buf), "[%s], unlink(%s) error, %s.\n", method_name,
                    fifo_stderr, strerror(errno));
            mon_log_write(MON_PROCESS_SETUPFIFO_6, SQ_LOG_ERR, buf);
            fifo_ok = false;
        }
    }
    if (fifo_ok)
    {
        if (mkfifo(fifo_stderr, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP))
        {
            char buf[MON_STRING_BUF_SIZE];
            snprintf(buf, sizeof(buf),
                     "[%s], mkfifo(%s) error, %s.\n", method_name, fifo_stderr,
                    strerror(errno));
            mon_log_write(MON_PROCESS_SETUPFIFO_7, SQ_LOG_ERR, buf);
        }
        else
        {
            fd_stderr_ = open (fifo_stderr, O_RDONLY | O_NONBLOCK);
            if (fd_stderr_ == -1)
            {
                char buf[MON_STRING_BUF_SIZE];
                snprintf(buf, sizeof(buf),
                         "[%s], fifo open(%s) error, %s.\n", method_name,
                        fifo_stderr, strerror(errno));
                mon_log_write(MON_PROCESS_SETUPFIFO_8, SQ_LOG_ERR, buf);
            }
            else
            {
                // close the unlinked file
                close(fifo_stderr_fd);
            }

#ifndef NAMESERVER_PROCESS
            Redirector.stderrFd(MyNode->GetHostname(), Name, Nid, attachee_pid, fd_stderr_);
#endif
        }
    }

    if (trace_settings & (TRACE_PROCESS | TRACE_REDIRECTION))
        trace_printf("%s@%d Process=%s, Pid=%d, Infile=[%s], "
                     "Outfile=[%s], fifo_stdout=%s, fd_stdout=%d, "
                     "fifo_stderr=%s, fd_stderr=%d\n",
                     method_name, __LINE__, Name, attachee_pid,
                     infile_.c_str(), outfile_.c_str(), fifo_stdout_.c_str(),
                     fd_stdout_, fifo_stderr_.c_str(), fd_stderr_);

    // Restore previous umask
    umask(prev_mask);

    TRACE_EXIT;
}
#endif

// LCOV_EXCL_START
// Methods CProcess::SetupPipe and CProcess::RedirectStdFiles are
// excluded from code coverage measurement.   They are executed only
// by a monitor child process not the monitor process itself.  Therefore
// they do not show up as covered lines when monitor code coverage measurement
// is done.


#ifndef NAMESERVER_PROCESS
void CProcess::SetupPipe(int orig_fd, int unused_pipe_fd, int pipe_fd)
{
    int newfd;
    char buf[MON_STRING_BUF_SIZE];

    const char method_name[] = "CProcess::SetupPipe";
    TRACE_ENTRY;

    // Close original file descriptor
    if (close(orig_fd))
    {
        snprintf(buf, sizeof(buf), "[%s], close(%d) error, %s.\n",
                 method_name, orig_fd, strerror(errno));
        mon_log_write(MON_PROCESS_SETUPPIPE_1, SQ_LOG_ERR, buf);
    }

    // Close unused pipe file descriptor
    if (close(unused_pipe_fd))
    {
        snprintf(buf, sizeof(buf), "[%s], close(%d) error, %s.\n", method_name,
                unused_pipe_fd, strerror(errno));
        mon_log_write(MON_PROCESS_SETUPPIPE_2, SQ_LOG_ERR, buf);
    }

    // Duplicate pipe file desciptor to original file descriptor number
    newfd = dup2(pipe_fd, orig_fd);
    if (newfd == -1)
    {
        snprintf(buf, sizeof(buf), "[%s], dup2(%d, %d) error, %s.\n",
                 method_name, pipe_fd, orig_fd, strerror(errno));
        mon_log_write(MON_PROCESS_SETUPPIPE_3, SQ_LOG_ERR, buf);
    }

    // Close the pipe file descriptor
    if (close(pipe_fd))
    {
        snprintf(buf, sizeof(buf), "[%s], close(%d) error, %s.\n", method_name,
                pipe_fd, strerror(errno));
        mon_log_write(MON_PROCESS_SETUPPIPE_4, SQ_LOG_ERR, buf);
    }

    TRACE_EXIT;
}
#endif

#ifndef NAMESERVER_PROCESS
void CProcess::RedirectStdFiles(int pfds_stdin[2], int pfds_stdout[2],
                                int pfds_stderr[2])

{
    const char method_name[] = "CProcess::RedirectStdFiles";
    TRACE_ENTRY;

    SetupPipe(0, pfds_stdin[1], pfds_stdin[0]);

    SetupPipe(1, pfds_stdout[0], pfds_stdout[1]);

    SetupPipe(2, pfds_stderr[0], pfds_stderr[1]);

    TRACE_EXIT;
}
#endif

// LCOV_EXCL_STOP

void CProcess::setEnvStr ( char **envp, int &countEnv, const char *str )
{
    envp[countEnv] = new char [ strlen(str)+1 ];
    strcpy ( envp[countEnv], str );
    ++countEnv;
}

void CProcess::setEnvStrVal ( char **envp, int &countEnv, const char *str,
                    const char *val)
{
    envp[countEnv] = new char [ strlen(str)+strlen(val)+2 ];
    sprintf ( envp[countEnv], "%s=%s", str, val );
    ++countEnv;
}

void CProcess::setEnvIntVal ( char **envp, int &countEnv, const char *str,
                              int val)
{
    envp[countEnv] = new char [ strlen(str)+13 ];
    sprintf ( envp[countEnv], "%s=%d", str, val );
    ++countEnv;
}

void CProcess::setEnvRegGroupVals(CConfigGroup *group, char **envp,
                                  int &countEnv)
{
    CConfigKey *key;

    const char method_name[] = "CProcess::setEnvRegGroupVals";
    TRACE_ENTRY;

    if (group)
    {
        key = group->GetKey((char *) "");
        while (key)
        {
            if (strncasecmp(key->GetName(), "~US_", 4) != 0)
            {  // Not an internal monitor unique string, ok to set
                setEnvStrVal(envp, countEnv, key->GetName(), key->GetValue());
            }
            if (countEnv >= MAX_CHILD_ENV_VARS)
            {
                break;
            }

            key = key->GetNext();
        }
    }
    TRACE_EXIT;
}

void CProcess::setEnvFromRegistry ( char **envp, int &countEnv )
{
    CConfigGroup *group;

    group = Config->GetClusterGroup();
    setEnvRegGroupVals ( group, envp, countEnv );

    group = Config->GetLocalNodeGroup();
    setEnvRegGroupVals ( group, envp, countEnv );

    group = Config->GetGroup(Name);
    setEnvRegGroupVals ( group, envp, countEnv );
}

#ifndef NAMESERVER_PROCESS
bool CProcess::Create (CProcess *parent, void* tag, int & result)
{
    bool monAltLogEnabled = false;
    bool seamonsterEnabled = false;
    bool shellTrace = false;
    bool successful = false;
    bool wdtDumpMonitor = false;
    bool wdtTraceCmd = false;
    bool wdtTraceInit = false;
    bool wdtTraceLio = false;
    bool wdtTraceEntryExit = false;
    bool wdtKeepAliveTimer = false;
    bool wdtMonProcRate = false;
    bool wdtLunmgrHangDelay = false;
    bool wdtLinuxWatchdog = false;
    bool wdtStartupTimer = false;
    int numProcessThreads = 0;
    int keepAliveValue = 0;
    int monitorCheckRateValue = 0;
    int lunmgrHangDelayValue = 0;
    int startupTimerValue = 0;
    int i;
    int j;
    int rc = -1;
    int rc2 = -1;
    char *env;
    char **argv;
    char *childEnv[MAX_CHILD_ENV_VARS + 1];
    int nextEnv = 0;
    int maxClientBuffers = SQ_LIO_MAX_BUFFERS;

    char la_buf[MON_STRING_BUF_SIZE];

    const char method_name[] = "CProcess::Create";
    TRACE_ENTRY;

    result = MPI_SUCCESS;

    if (Debug)
    {
        if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
           trace_printf("%s@%d" " - Starting process through debugger" "\n", method_name, __LINE__);
    }

    pid_t os_pid;
    char sonardir[MAX_PROCESS_PATH];
    char user[50];
    char filename[MAX_PROCESS_PATH];
    char home[MAX_PROCESS_PATH];
    char mpiroot[MAX_PROCESS_PATH];
    char mpitmpdir[MAX_PROCESS_PATH];
    char mpiflags[20];
    char mpi_ic_order[10];
    char mpi_test_delay[10];
    char mpi_error_level[10];
    char sq_ic[5];
    char term[20];
    char tz[100];
    bool tz_exists;
    char xauthority[MAX_PROCESS_PATH];
    char *display;
    char *vnodes;
    char nsCommPort[10];
    char nsSyncPort[10];
    char nsMon2NsPort[10];
    char nsConfigDb[MAX_PROCESS_PATH];
    MON_Props xprops(true);
    MON_Props xprops_exe(true);
    char *xprops_exe_file;

    // get last used default environment
    env = getenv ("TERM");
    STRCPY (term, (env?env:"ansi"));
    env = getenv ("TZ");
    tz_exists = (env != NULL);
    if (tz_exists)
    {
      STRCPY (tz, env); // see note regarding TZ below
    }
    env = getenv ("USER");
    STRCPY (user, (env?env:""));
    env = getenv ("HOME");
    STRCPY (home, (env?env:""));
    env = getenv ("SONAR_ROOT");
    STRCPY (sonardir, (env?env:""));
    env = getenv ("MPI_ROOT");
    STRCPY (mpiroot, (env?env:""));
    env = getenv ("MPI_TMPDIR");
    STRCPY (mpitmpdir, (env?env:home));
   // strcpy (mpiflags, "l,y0,Eon");
    strcpy (mpiflags, "y0");
    if (Debug)
    {
        strcat(mpiflags,",egdb");
    }
    env = getenv ("MPI_TEST_DELAY");
    STRCPY(mpi_test_delay,(env?env:"2"));
    env = getenv ("MPI_ERROR_LEVEL");
    strcpy(mpi_error_level,(env?env:"2"));
    STRCPY (xauthority, home);
    strcat (xauthority, "/.Xauthority");
    display = getenv ("DISPLAY");
    vnodes = getenv("SQ_VIRTUAL_NODES");
    env=getenv("SQ_IC");
    if(env)
    {
        if ((strcmp(env,"IBV")==0) || (strcmp(env,"-IBV")==0))
        {
            strcpy(sq_ic, "-IBV");
            strcpy(mpi_ic_order, "IBV");
        }
        else
        {
            strcpy(sq_ic, "-TCP");
            strcpy(mpi_ic_order, "TCP");
        }
    }
    else
    {
        strcpy(sq_ic, "-TCP");
        strcpy(mpi_ic_order, "TCP");
    }

    env = getenv( "SQ_LIO_MAX_BUFFERS" );
    if (env)
    {
        maxClientBuffers = atoi( env );
    }

    env = getenv( "SQ_LOCAL_IO_SHELL_TRACE" );
    if (env && strcmp( env, "1" ) == 0)
       shellTrace = true;

    if ( Type == ProcessType_NameServer )
    {
        env = getenv ("NS_COMM_PORT");
        STRCPY (nsCommPort, (env?env:""));
        env = getenv ("NS_SYNC_PORT");
        STRCPY (nsSyncPort, (env?env:""));
        env = getenv ("NS_M2N_COMM_PORT");
        STRCPY (nsMon2NsPort, (env?env:""));
        env = getenv ("SQ_CONFIGDB");
        STRCPY (nsConfigDb, (env?env:""));
    }
    if ( Type == ProcessType_Watchdog )
    {
        env = getenv( "WDT_TRACE_CMD" );
        if (env && strcmp( env, "1" ) == 0)
           wdtTraceCmd = true;
        env = getenv( "WDT_TRACE_INIT" );
        if (env && strcmp( env, "1" ) == 0)
           wdtTraceInit = true;
        env = getenv( "WDT_TRACE_LIO" );
        if (env && strcmp( env, "1" ) == 0)
           wdtTraceLio = true;
        env = getenv( "WDT_TRACE_ENTRY_EXIT" );
        if (env && strcmp( env, "1" ) == 0)
           wdtTraceEntryExit = true;
        env = getenv( "SQ_WDT_KEEPALIVETIMERVALUE" );
        if (env && isdigit(*env))
        {
            wdtKeepAliveTimer = true;
            keepAliveValue = atoi(env);
        }
        env = getenv( "SQ_WDT_MONITOR_PROCESS_CHECKRATE" );
        if (env && isdigit(*env))
        {
            wdtMonProcRate = true;
            monitorCheckRateValue = atoi(env);
        }
        env = getenv( "SQ_WDT_LUNMGR_PROCESS_HANGDELAY" );
        if (env && isdigit(*env))
        {
            wdtLunmgrHangDelay = true;
            lunmgrHangDelayValue = atoi(env);
        }
        env = getenv( "SQ_LINUX_WATCHDOG" );
        if (env && strcmp( env, "1" ) == 0)
           wdtLinuxWatchdog = true;
        env = getenv( "SQ_WDT_STARTUPTIMERVALUE" );
        if (env && isdigit(*env))
        {
            wdtStartupTimer = true;
            startupTimerValue = atoi(env);
        }
        env = getenv( "SQ_WDT_DUMP_MONITOR" );
        if (env && strcmp( env, "1" ) == 0)
           wdtDumpMonitor = true;
    }

    env = getenv( "SQ_MON_ALTLOG" );
    if (env && strcmp( env, "1" ) == 0)
       monAltLogEnabled = true;

    env = getenv( "SQ_SEAMONSTER" );
    if (env && strcmp( env, "1" ) == 0)
       seamonsterEnabled = true;

    env = getenv( "SQ_LIO_PROCESS_THREADS" );
    if (env && isdigit(*env))
       numProcessThreads = atoi(env);

    env = getenv( "TRAF_CONF" );
    if (env)
    {
        trafConf_ = env ;
    }
    env = getenv( "TRAF_HOME" );
    if (env)
    {
        trafHome_ = env ;
    }
    env = getenv( "TRAF_VAR" );
    if (env)
    {
        trafVar_ = env ;
    }

    // setup default environment variables from monitor or last CreateProcess call
    if (maxClientBuffers)
    {
        setEnvIntVal ( childEnv, nextEnv, "SQ_LIO_MAX_BUFFERS", maxClientBuffers );
    }
    if (numProcessThreads)
    {
        setEnvIntVal ( childEnv, nextEnv, "SQ_LIO_PROCESS_THREADS",
                       numProcessThreads );
    }
    if (shellTrace)
    {
        setEnvStr ( childEnv, nextEnv, "SQ_LOCAL_IO_SHELL_TRACE=1" );
    }

    setEnvStrVal ( childEnv, nextEnv, "MPI_ROOT", mpiroot );

    if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
       trace_printf("%s@%d - MPI_ROOT = %s\n", method_name, __LINE__, mpiroot);

    setEnvStrVal ( childEnv, nextEnv, "MPI_TMPDIR", mpitmpdir );

    if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
       trace_printf("%s@%d - MPI_TMPDIR=%s\n", method_name, __LINE__,
                    mpitmpdir);

    setEnvStrVal ( childEnv, nextEnv, "MPI_FLAGS", mpiflags );

    if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
       trace_printf("%s@%d - MPI_FLAGS=%s\n", method_name, __LINE__, mpiflags);

    setEnvStrVal ( childEnv, nextEnv, "MPI_IC_ORDER", mpi_ic_order );

    if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
       trace_printf("%s@%d - MPI_IC_ORDER=%s\n", method_name, __LINE__,
                    mpi_ic_order);

    setEnvStrVal ( childEnv, nextEnv, "MPI_TEST_DELAY", mpi_test_delay );

    setEnvStrVal ( childEnv, nextEnv, "MPI_ERROR_LEVEL", mpi_error_level );

    setEnvStr ( childEnv, nextEnv, "MPI_RDMA_MSGSIZE=32768,131072,4194304" );

    setEnvStr ( childEnv, nextEnv, "HPMP_SQ=1" );

    setEnvStr ( childEnv, nextEnv, "MALLOC_ARENA_MAX=1" );

    setEnvStr ( childEnv, nextEnv, "HPMP_SINGLETON_HA=1" );

    if ( strcmp( mpi_ic_order, "IBV" ) == 0 )
    {
        setEnvStr ( childEnv, nextEnv, "MPI_HASIC_IBV=1" );
    }

    if ( Measure == 1 )
    {
        snprintf(filename,sizeof(filename),"%s/%s", mpitmpdir, Name);
        setEnvStrVal ( childEnv, nextEnv, "MPI_INSTR", filename );
    }
    else if ( Measure == 2 )
    {
        snprintf(filename,sizeof(filename),"%s/%s.cpu:cpu", mpitmpdir, Name);
        setEnvStrVal ( childEnv, nextEnv, "MPI_INSTR", filename );
    }

    setEnvStrVal ( childEnv, nextEnv, "TRAF_CONF", trafConf_.c_str() );
    setEnvStrVal ( childEnv, nextEnv, "TRAF_HOME", trafHome_.c_str() );
    setEnvStrVal ( childEnv, nextEnv, "TRAF_VAR", trafVar_.c_str() );
    setEnvStrVal ( childEnv, nextEnv, "USER", user );
    setEnvStrVal ( childEnv, nextEnv, "HOME", home );
    setEnvStrVal ( childEnv, nextEnv, "TERM", term );
    if (tz_exists)
    {
      // Note that if TZ does not exist, we don't want to set it.
      // The absence of TZ causes the glib localtime function to
      // use the local time as defined in /etc/localtime. But,
      // an invalid TZ setting (such as the empty string) causes
      // the localtime function to use UTC. So, the semantics of
      // an unset TZ are not the same as the semantics of
      // TZ=<empty string>.
      setEnvStrVal ( childEnv, nextEnv, "TZ", tz );
    }
    setEnvStrVal ( childEnv, nextEnv, "CLASSPATH", getenv("CLASSPATH"));

    if ( display )
    {
        setEnvStrVal ( childEnv, nextEnv, "DISPLAY", display );
    }
    setEnvStrVal ( childEnv, nextEnv, "XAUTHORITY", xauthority );
    setEnvStrVal ( childEnv, nextEnv, "SQ_IC", sq_ic );
    if ( vnodes && *vnodes )
    {
        setEnvStrVal ( childEnv, nextEnv, "SQ_VIRTUAL_NODES", vnodes );
        setEnvIntVal ( childEnv, nextEnv, "SQ_VIRTUAL_NID", MyPNID );
        setEnvIntVal ( childEnv, nextEnv, "SQ_LIO_VIRTUAL_NID", MyPNID );
    }

    if ( Type == ProcessType_NameServer )
    {
        setEnvStr ( childEnv, nextEnv, "SQ_MON_CREATOR=MPIRUN" );
        setEnvStr ( childEnv, nextEnv, "SQ_MON_RUN_MODE=AGENT" );
        if ( nsCommPort[0] )
            setEnvStrVal ( childEnv, nextEnv, "NS_COMM_PORT", nsCommPort );
        if ( nsSyncPort[0] )
            setEnvStrVal ( childEnv, nextEnv, "NS_SYNC_PORT", nsSyncPort );
        if ( nsMon2NsPort[0] )
            setEnvStrVal ( childEnv, nextEnv, "NS_M2N_COMM_PORT", nsMon2NsPort );
        if (nsConfigDb[0] )
            setEnvStrVal ( childEnv, nextEnv, "SQ_CONFIGDB", nsConfigDb );
    }
    if ( Type == ProcessType_Watchdog )
    {
        if ( wdtTraceCmd )
        {
            setEnvStr ( childEnv, nextEnv, "WDT_TRACE_CMD=1" );
        }
        if ( wdtTraceInit )
        {
            setEnvStr ( childEnv, nextEnv, "WDT_TRACE_INIT=1" );
        }
        if ( wdtTraceLio )
        {
            setEnvStr ( childEnv, nextEnv, "WDT_TRACE_LIO=1" );
        }
        if ( wdtTraceEntryExit )
        {
            setEnvStr ( childEnv, nextEnv, "WDT_TRACE_ENTRY_EXIT=1" );
        }
        if ( wdtKeepAliveTimer )
        {
            setEnvIntVal ( childEnv, nextEnv, "SQ_WDT_KEEPALIVETIMERVALUE", keepAliveValue );
        }
        if ( wdtMonProcRate )
        {
            setEnvIntVal ( childEnv, nextEnv, "SQ_WDT_MONITOR_PROCESS_CHECKRATE", monitorCheckRateValue );
        }
        if ( wdtLunmgrHangDelay )
        {
            setEnvIntVal ( childEnv, nextEnv, "SQ_WDT_LUNMGR_PROCESS_HANGDELAY", lunmgrHangDelayValue );
        }
        if ( wdtLinuxWatchdog )
        {
            setEnvStr ( childEnv, nextEnv, "SQ_LINUX_WATCHDOG=1" );
        }
        if ( wdtStartupTimer )
        {
            setEnvIntVal ( childEnv, nextEnv, "SQ_WDT_STARTUPTIMERVALUE", startupTimerValue );
        }
        if ( wdtDumpMonitor )
        {
            setEnvStr ( childEnv, nextEnv, "SQ_WDT_DUMP_MONITOR=1" );
        }
        if ( monAltLogEnabled )
        {
            setEnvStr ( childEnv, nextEnv, "SQ_MON_ALTLOG=1" );
        }
    }
    if ( Type == ProcessType_PSD || Type == ProcessType_SMS )
    {
        if ( monAltLogEnabled )
        {
            setEnvStr ( childEnv, nextEnv, "SQ_MON_ALTLOG=1" );
        }
    }
    if ( seamonsterEnabled )
    {
        setEnvStr ( childEnv, nextEnv, "SQ_SEAMONSTER=1" );
    }

    string LDpath;
    static bool sv_getenv_ld_library_path_done = false;
    static string sv_ld_library_path;
    if (IsAgentMode)
    {
        if (! sv_getenv_ld_library_path_done)
        {
            sv_getenv_ld_library_path_done = true;
            sv_ld_library_path = getenv( "LD_LIBRARY_PATH" );
            if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
            {
                trace_printf( "%s@%d" " - LD_LIBRARY_PATH = " "%s" "\n", method_name, __LINE__, sv_ld_library_path.c_str() );
            }
        }
        LDpath = sv_ld_library_path;
        if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
        {
            trace_printf( "%s@%d" " - LD_LIBRARY_PATH = " "%s" "\n", method_name, __LINE__, LDpath.c_str() );
        }
    }
    else
    {
        if (ldpathStrId_.nid != -1)
        {
            Config->strIdToString( ldpathStrId_, LDpath );
        }
    }
    if (!LDpath.empty())
    {
        setEnvStrVal( childEnv, nextEnv, "LD_LIBRARY_PATH", LDpath.c_str( ) );
        if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
        {
            trace_printf( "%s@%d - LD_LIBRARY_PATH = %s\n", method_name, __LINE__, LDpath.c_str() );
        }
    }

    setEnvStr ( childEnv, nextEnv, "LD_BIND_NOW=true" );

    string program;
    Config->strIdToString ( programStrId_, program );
    // temp for performance investigation
    if ( strstr(program.c_str(), "tdm_arkcmp") != NULL
      || strstr(program.c_str(), "tdm_arkesp") != NULL )
    {
        cmpOrEsp_ = true;
    }
    // Save actual program filename and set PWD environment variable
    size_t lastSlash = program.rfind('/');
    if (lastSlash == string::npos)
    {   // At top level directory
        STRCPY(filename, program.c_str());
    }
    else
    {
        STRCPY(filename, &program[lastSlash+1]);
    }
    if (lastSlash == string::npos || lastSlash == 0)
    {
        setEnvStr ( childEnv, nextEnv, "PWD=/" );

        if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
            trace_printf("%s@%d - PWD=/\n", method_name, __LINE__);
    }
    else
    {
        string pwd = program.substr(0, lastSlash);
        setEnvStrVal ( childEnv, nextEnv, "PWD", pwd.c_str() );
        if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
            trace_printf("%s@%d - PWD=%s\n", method_name, __LINE__,
                         pwd.c_str());
    }

    string path;
    static bool sv_getenv_path_done = false;
    static string sv_path;
    if (IsAgentMode)
    {
        if (! sv_getenv_path_done)
        {
            sv_getenv_path_done = true;
            sv_path = getenv( "PATH" );
            if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
            {
                trace_printf( "%s@%d" " - PATH = " "%s" "\n", method_name, __LINE__, sv_path.c_str() );
            }
        }
        path = sv_path;
        if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
        {
            trace_printf( "%s@%d" " - PATH = " "%s" "\n", method_name, __LINE__, path.c_str() );
        }
    }
    else
    {
        if (pathStrId_.nid != -1)
        {
            Config->strIdToString( pathStrId_, path );
        }
    }
    setEnvStrVal( childEnv, nextEnv, "PATH", path.c_str( ) );
    if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
    {
        trace_printf( "%s@%d" " - PATH = " "%s" "\n", method_name, __LINE__, path.c_str() );
    }

    // Set values from registry as environment variables
    setEnvFromRegistry ( childEnv, nextEnv );

    xprops_exe_file = NULL;
    xprops.load("mon.env");
    MON_Smap_Enum xenum(&xprops);
    if (xenum.more())
    {
        snprintf(la_buf, sizeof(la_buf),
                 "[CProcess::Create], Warning: using mon.env.\n");
    }
    while (xenum.more())
    {
        char *xkey = xenum.next();
        const char *xvalue = xprops.get(xkey);
        if (memcmp(xkey, "SQ_PROPS_", 9) == 0)
        {
            if (strcasecmp(&xkey[9], filename) == 0)
                xprops_exe_file = (char *) xvalue;
        }
        setEnvStrVal ( childEnv, nextEnv, xkey, xvalue );
        if (nextEnv > MAX_CHILD_ENV_VARS)
        {   // Exceeded array size
            nextEnv = MAX_CHILD_ENV_VARS;
            break;
        }
        if (trace_settings
            & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
            trace_printf("%s@%d - mon.env %s=%s\n", method_name, __LINE__, xkey,
                         xvalue );
    }
    if (xprops_exe_file != NULL)
    {
        // load exe-property-file
        xprops_exe.load(xprops_exe_file);
        MON_Smap_Enum xenum(&xprops_exe);
        while (xenum.more())
        {
            char *xkey = xenum.next();
            const char *xvalue = xprops_exe.get(xkey);
            setEnvStrVal ( childEnv, nextEnv, xkey, xvalue );
            if (nextEnv > MAX_CHILD_ENV_VARS)
            {   // Exceeded array size
                nextEnv = MAX_CHILD_ENV_VARS;
                break;
            }
            if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
                trace_printf("%s@%d - %s %s=%s\n", method_name, __LINE__, xprops_exe_file, xkey, xvalue);
        }
    }
    // Add environment array terminator required by execve.
    childEnv[nextEnv] = NULL;
    ++nextEnv;

    if ( !SMSIntegrating && Type == ProcessType_SMS && !Clone && !argc_ )
    {
        argv = new char *[13];
    }
    else
    {
        argv = new char *[argc_ + 13];
    }
    argv[0] = new char [strlen(filename)+1];
    strcpy(argv[0], filename);
    j = 1;

    // finish setting up arguments for process after <filename> in argv[0]
    // "SQMON1.0" <pnid> <nid> <pid> <pname> <port> <ptype> <zid> <verifier> "SPARE"
    //     [1]     [2]    [3]   [4]    [5]    [6]    [7]     [8]      [9]     [10]
    argv[j] = new char[9];
    sprintf (argv[j], "SQMON1.1");

    argv[j + 1] = new char[6];
    sprintf (argv[j + 1], "%5.5d", MyPNID);

    argv[j + 2] = new char[6];
    sprintf (argv[j + 2], "%5.5d", Nid);

    argv[j + 3] = new char[7];
    //sprintf (argv[j + 3], "%6.6d", Pid);
    strcpy(argv[j + 3],"??????"); // The Pid will be assigned later, but we can't print it then.

    argv[j + 4] = new char[strlen(Name) ? strlen(Name)+1 : MAX_PROCESS_NAME_STR];
    strcpy (argv[j + 4], Name);

    argv[j + 5] = new char[strlen (MyCommPort) + 1];
    strcpy (argv[j + 5], MyCommPort);

    argv[j + 6] = new char[6];
    sprintf (argv[j + 6], "%5.5d", Type);

    argv[j + 7] = new char[6];
    sprintf (argv[j + 7], "%5.5d", MyNode->GetZone());

    SetVerifier(); // CProcess::Create
    argv[j + 8] = new char[6];
    sprintf (argv[j + 8], "%5.5d", Verifier);

    argv[j + 9] = new char[6];
    sprintf (argv[j + 9], "SPARE");

    if ( !SMSIntegrating && Type == ProcessType_SMS && !Clone && !argc_ )
    {
        argc_ = 1;
        argv[j + 10] = new char[7];
        sprintf (argv[j + 10], "sminit");
        argv[j + 11] = NULL;
    }
    else
    {
        // now append user args
        const char *pUserArgv = userArgv_;
        int arglen;
        for (i = 0; i < argc_; i++)
        {
            arglen = strlen (pUserArgv) + 1;
            argv[i + j + 10] = new char[arglen];
            strcpy (argv[i + j + 10], pUserArgv);
            pUserArgv += arglen;
        }
        argv[i + j + 10] = NULL;
    }

    // start process and place in list
    if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
    {
        trace_printf("%s@%d" " - Program='" "%s" "' argc=" "%d" "\n", method_name, __LINE__, program.c_str(), argc_ + j + 10);
        i = 0;
        while (argv[i] != NULL)
        {
            trace_printf("%s@%d" " - argv[" "%d" "]="  "%s" "\n", method_name, __LINE__, i, argv[i]);
            i++;
        }
    }

    // Create pipes for inter-process communication between new process
    // and the monitor.
    int pfds_stdin[2];
    if (pipe (pfds_stdin))
    {  // Error creating pipe
        snprintf(la_buf, sizeof(la_buf), "[%s], stdin pipe error, %s.\n",
                 method_name, strerror(errno));
        mon_log_write(MON_PROCESS_CREATE_1, SQ_LOG_ERR, la_buf);
        pfds_stdin[0] = -1;
        pfds_stdin[1] = -1;
    }

    int pfds_stdout[2];
    if (pipe (pfds_stdout))
    {  // Error creating pipe
        snprintf(la_buf, sizeof(la_buf), "[%s], stdout pipe error, %s.\n",
                 method_name, strerror(errno));
        mon_log_write(MON_PROCESS_CREATE_2, SQ_LOG_ERR, la_buf);
        pfds_stdout[0] = -1;
        pfds_stdout[1] = -1;
    }

    int pfds_stderr[2];
    if (pipe (pfds_stderr))
    {  // Error creating pipe
        snprintf(la_buf, sizeof(la_buf), "[%s], stderr pipe error, %s.\n",
                 method_name, strerror(errno));
        mon_log_write(MON_PROCESS_CREATE_3, SQ_LOG_ERR, la_buf);
        pfds_stderr[0] = -1;
        pfds_stderr[1] = -1;
    }

    MemModLock.lock();

    // make all child variable accessed only from heap
    int priority = Priority;

#ifdef USE_FORK_SUSPEND_RESUME
    mon_thread_suspend_all();
#endif // USE_FORK_SUSPEND_RESUME

    sigset_t forkMask;
    sigset_t oldMask;
    sigemptyset(&forkMask);
    sigaddset(&forkMask, SIGCHLD);
    rc = pthread_sigmask(SIG_BLOCK, &forkMask, &oldMask);
    if (rc != 0)
    {
        snprintf(la_buf, sizeof(la_buf),
                 "[%s], pthread_sigmask() error: %s (%d)\n",
                 method_name, strerror(rc), rc );
        mon_log_write(MON_PROCESS_CREATE_4, SQ_LOG_ERR, la_buf);
    }

    // this pipe is used to tell the child to go away if monitor detects
    // a duplicate pid. This can occur if there is a pending child death signal.
    int pipefd[2];
    pipe(pipefd);
    bool childGoAway = false;

    SetCreationTime(-1);
    os_pid = fork ();
    if (os_pid == -1)
    {
        // can't start a process
        rc = result = MPI_ERR_SPAWN;
    }
    else if (os_pid)
    {
        // I am monitor

        rc = pthread_sigmask(SIG_SETMASK, &oldMask, NULL);
        if (rc != 0)
        {
            snprintf(la_buf, sizeof(la_buf),
                     "[%s], pthread_sigmask() error: %s (%d)\n",
                     method_name, strerror(rc), rc );
            mon_log_write(MON_PROCESS_CREATE_5, SQ_LOG_ERR, la_buf);
        }

        // check if process already exists with the same pid.
        if (MyNode->GetProcess(os_pid) != NULL)
        {
            rc = result = MPI_ERR_SPAWN;
            // tell the child to go away
            childGoAway = true;
            snprintf(la_buf, sizeof(la_buf),
                  "[%s], pid already exists, aborting process create: pid = %d\n",
                  method_name, os_pid );
            mon_log_write(MON_PROCESS_CREATE_4, SQ_LOG_ERR, la_buf);
        }

        // tell the child to stay or go away
        close(pipefd[0]); // close the read-end of the pipe, not going to use
        write(pipefd[1], &childGoAway, sizeof(childGoAway));
        close(pipefd[1]); // close the write-end of the pipe, sending EOF.

        if (childGoAway)
        {   // no need to continue connecting with child
            goto forkExit;
        }

        // I'm the monitor ... connect to child
        rc = MPI_SUCCESS;

        // Save process id and build process name if not already named
        Pid = os_pid;
        if (Name[0] == '\0')
        {   // No name assigned to the process so generate one based on
            // the node-id and process-id.
            MyNode->BuildOurName(Nid, os_pid, Name);

            if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
                trace_printf("%s@%d - No process name specified, generated name=%s for process (%d, %d)\n", method_name, __LINE__, Name, Nid, os_pid);
        }

        if (NameServerEnabled && tag != NULL)
        {
            // Send actual pid and process name back to parent
            // STDIO Redirection requires that clone process in parent node
            // have the actual pid
            rc2 = PtpClient->ProcessInit( this
                                        , tag
                                        , 0
                                        , parent->Nid );
            if (rc2)
            {
                char la_buf[MON_STRING_BUF_SIZE];
                CLNode *parentLNode = NULL;
                parentLNode = Nodes->GetLNode( parent->Nid );
                snprintf( la_buf, sizeof(la_buf)
                        , "[%s] - Can't send process create success "
                          "for process %s (%d, %d) "
                          "to parent node %s, nid=%d\n"
                        , method_name
                        , GetName()
                        , GetNid()
                        , GetPid()
                        , parentLNode->GetNode()->GetName()
                        , parentLNode->GetNid() );
                mon_log_write(MON_PROCESS_CREATE_12, SQ_LOG_ERR, la_buf);
            }
        }

        if (trace_settings & (TRACE_PROCESS | TRACE_REDIRECTION))
            trace_printf("%s@%d Process=%s, Infile=[%s], Outfile=[%s]\n",
                         method_name, __LINE__, Name, infile_.c_str(),
                         outfile_.c_str());

        // stdin pipe to child:
        //    We don't need read end of pipe.
        //    Add the write end of file descriptor to list of file
        //       descriptors monitored.
        if (pfds_stdin[1] != -1)
        {
            close(pfds_stdin[0]);

            // Decide on standard input source for the
            // process.  It will either be a filename on this node
            // or handled by a specific process on another node.
            int AncestorNid = -1;
            int AncestorPid = -1;
            char Stdfile[MAX_PROCESS_PATH];
            if (PickStdfile(PICK_STDIN, Stdfile, AncestorNid, AncestorPid))
            {
                Redirector.stdinFd(Nid, os_pid, pfds_stdin[1], Stdfile,
                                   AncestorNid, AncestorPid);
                fd_stdin_ = pfds_stdin[1];
            }
            else
            {
                if (trace_settings & (TRACE_PROCESS | TRACE_REDIRECTION))
                    trace_printf("%s@%d Unable to find stdin file for "
                                 "Process=%s, pid=%d.  Closing stdin pipe "
                                 "fd=%d\n", method_name, __LINE__, Name,
                                 os_pid, pfds_stdin[1]);
                close ( pfds_stdin[1] );
            }
        }

        // stdout pipe to child:
        //    We don't need write end of pipe.
        //    Add the read end of file descriptor to list of file
        //       descriptors monitored.
        if (pfds_stdout[0] != -1)
        {
            close(pfds_stdout[1]);

            // Decide on standard output destination for the
            // process.  It will either be a filename on this node
            // or handled by a specific process on another node.
            int AncestorNid = -1;
            int AncestorPid = -1;
            char Stdfile[MAX_PROCESS_PATH];
            if (!PickStdfile(PICK_STDOUT, Stdfile, AncestorNid, AncestorPid))
            {  // Unable to locate stdout file.  So create a file based
               // on the process name and use that for output.
                strcpy(Stdfile, "stdout_");
                strcat(Stdfile, Name);
                if (trace_settings & (TRACE_PROCESS | TRACE_REDIRECTION))
                    trace_printf("%s@%d Unable to find stdout file for "
                             "process=%s, pid=%d.  Using file %s for stdout.\n",
                                 method_name, __LINE__, Name, os_pid, Stdfile);
            }
            Redirector.stdoutFd(Nid, os_pid, pfds_stdout[0], Stdfile,
                                AncestorNid, AncestorPid);

            fd_stdout_ = pfds_stdout[0];
        }

        // stderr pipe to child:
        //    We don't need write end of pipe.
        //    Add the read end of file descriptor to list of file
        //       descriptors monitored.
        if (pfds_stderr[0] != -1)
        {
            close(pfds_stderr[1]);
            Redirector.stderrFd(MyNode->GetHostname(), Name, Nid, os_pid, pfds_stderr[0]);
            fd_stderr_ = pfds_stderr[0];
        }

    forkExit:
        // release fork semaphore so child can get it
        if ( sem_post(MyNode->GetMutex()) == -1 )
        {
            snprintf(la_buf, sizeof(la_buf),
                     "[CProcess::Create], Parent can't put mutex.\n");
            mon_log_write(MON_PROCESS_CREATE_7, SQ_LOG_ERR, la_buf);
        }
        if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
           trace_printf("%s@%d - Parent put mutex so child can proceed.\n",
                        method_name, __LINE__);

#ifdef USE_FORK_SUSPEND_RESUME
        mon_thread_resume_suspended();
#endif // USE_FORK_SUSPEND_RESUME

    }
// LCOV_EXCL_START
// Exclude the following from monitor code coverage measurement since
// it is executed by a child process not the monitor process.
    else
    {
        // I'm the child process

        // Take fork semaphore.  We need to wait until parent indicates
        // it is ok to proceed.  Pipes between parent and child need to
        // be set up before child can continue.
        bool sem_log_error = false;
        int sem_rc;
        int err = 0;
        struct timeval logTime;
        struct tm *ltime;
        struct timespec ts;

        if (clock_gettime(CLOCK_REALTIME, &ts) == -1)
        {
            err = errno;
            gettimeofday(&logTime, NULL);
            ltime = localtime(&logTime.tv_sec);
            snprintf(la_buf, sizeof(la_buf),
                     "%02d/%02d/%02d-%02d:%02d:%02d "
                     "[CProcess::Create], clock_gettime(CLOCK_REALTIME),"
                     " Child can't get time, %s (%d), program %s, (pid=%d).\n"
                     , ltime->tm_mon+1, ltime->tm_mday, ltime->tm_year-100, ltime->tm_hour, ltime->tm_min, ltime->tm_sec
                     , strerror(err), err
                     , filename, getpid());
            write (2, la_buf, strlen(la_buf));
        }
        ts.tv_sec  += 1;

        env = getenv( "MON_CREATE_SEM_DELAY" );
        if (env && isdigit(*env))
        {
            ts.tv_sec = atol(env);
        }

        env = getenv( "MON_CREATE_SEM_LOG_ERROR" );
        if (env && isdigit(*env))
        {
            int val = atoi(env);
            sem_log_error = (val != 0) ? true : false;
        }
        do
        {
            sem_rc = sem_timedwait(MyNode->GetMutex(), &ts);
            err = errno;
            if ( sem_log_error && err == ETIMEDOUT )
            {
                gettimeofday(&logTime, NULL);
                ltime = localtime(&logTime.tv_sec);
                snprintf(la_buf, sizeof(la_buf),
                         "%02d/%02d/%02d-%02d:%02d:%02d "
                         "[CProcess::Create], Child can't take mutex,"
                         " %s (%d), program %s, (pid=%d).\n"
                         , ltime->tm_mon+1, ltime->tm_mday, ltime->tm_year-100, ltime->tm_hour, ltime->tm_min, ltime->tm_sec
                         , strerror(err), err
                         , filename, getpid());
                write (2, la_buf, strlen(la_buf));
            }
        }
        while (sem_rc == -1 && (err == EINTR || err == ETIMEDOUT));

        if ( sem_log_error && sem_rc == -1 && !(err == EINTR || err == ETIMEDOUT))
        {
            gettimeofday(&logTime, NULL);
            ltime = localtime(&logTime.tv_sec);
            snprintf(la_buf, sizeof(la_buf),
                     "%02d/%02d/%02d-%02d:%02d:%02d "
                     "[CProcess::Create], Child can't take mutex,"
                     " %s (%d), program %s, (pid=%d).\n"
                     , ltime->tm_mon+1, ltime->tm_mday, ltime->tm_year-100, ltime->tm_hour, ltime->tm_min, ltime->tm_sec
                     , strerror(errno), errno
                     , filename, getpid());
            write (2, la_buf, strlen(la_buf));
        }

        // check if monitor wanted child to stay or go away
        close(pipefd[1]); // close the write-end, not going to use
        // read till EOF
        while (read(pipefd[0], &childGoAway, sizeof(childGoAway)) > 0);
        close(pipefd[0]); // close the read-end of the pipe

        if (childGoAway)
        {
            _exit( ENOEXEC );
        }

        // set the process's process id to the os process id for compatability
        pid_t myPid = getpid();
        sprintf (argv[j + 3], "%6.6d", myPid);

        char *pName = argv[j + 4];
        if (pName[0] == '\0')
        {   // No name assigned to the process so generate one based on
            // the node-id and process-id.
            MyNode->BuildOurName(Nid, myPid, pName);
        }

        // Unmask all allowed signals in the child
        // except SIGUSR1
        sigset_t              mask;
        sigemptyset(&mask);
        sigaddset(&mask, SIGUSR1);
        rc = pthread_sigmask(SIG_SETMASK, &mask, NULL);
        if (rc != 0)
        {
            snprintf(la_buf, sizeof(la_buf),
                     "[CProcess::Create], pthread_sigmask() error:"
                     " %s (%d), program %s.\n", strerror(rc), rc, filename);
            write (2, la_buf, strlen(la_buf));
        }

        // set child process's priority based on minimums and specified value
        nice(priority);

        // Redirect standard input, standard output, standard error
        RedirectStdFiles(pfds_stdin, pfds_stdout, pfds_stderr);

        // Close file descriptors opened by the monitor parent except
        // for stdin, stdout, stderr.
        MyNode->close_fds ();

        char *name;
        size_t pathlen;

        // Get program search path
        pathlen = path.length();

        size_t len;
        len = strlen(filename) + 1;

        // Allocate space to hold the pathnames + filename
        size_t alloclen;
        alloclen = pathlen + len + 1;
        name = new char[alloclen];

        // Place the program filename at the end of the buffer preceeded
        // by a slash.
        name =  (char *) memcpy(name + pathlen + 1, filename, len);
        *--name = '/';

        // Try to find the program in the directories specified by PATH.
        // Each element of the path is tried until we find the program
        // or run out of elements to try.
        const char *pEnd;
        const char *pStart;

        pEnd = path.c_str();
        do
        {
            char *startp;

            pStart = pEnd;
            pEnd = strchr(pStart, ':');
            if (!pEnd)
                pEnd = strchr(pStart, '\0');

            if (pEnd == pStart)
                // Two adjacent colons, or a colon at the beginning or the end
                // of `PATH' means to search the current directory.
                startp = name + 1;
            else
                // Copy the next path into the buffer just before the
                // program filename.
                startp = (char *) memcpy(name - (pEnd - pStart), pStart, pEnd - pStart);

            // Try to execute this name.  If it works, execve will not return.
            execve( startp, argv, childEnv);

            switch  (errno)
            {
            case EACCES:
            case ENOENT:
            case ESTALE:
            case ENOTDIR:
            case ENODEV:
            case ETIMEDOUT:
            case ENOEXEC:
                // Those errors indicate the file is missing or not
                // executable by us, in which case we want to just try
                // the next path directory.
                break;

            default:
                // Some other error means we found an executable file, but
                // something went wrong executing it; return the error to
                // our caller.
                goto execFailed;
            }
        }  while (*pEnd++ != '\0');

    execFailed:
        // The specified program could not be executed.  Note that at this
        // point we are executing as the child process.  We will exit and
        // the monitor will get a "child death" signal and take the
        // appropriate actions.
        //
        // It's probably not possible to log an error at this point
        // since the error logging mechanism is probably not available
        // at this early stage of child process startup.  We can write to
        // the standard error file descriptor since the monitor has set that
        // up as a pipe back to itself.

        snprintf(la_buf, sizeof(la_buf),
                 "Unable to execute program %s, %s (%d).\n",
                 filename, strerror(errno), errno);
        write (2, la_buf, strlen(la_buf));

        _exit( errno );
    }
// LCOV_EXCL_STOP

    MemModLock.unlock();

    if (rc == MPI_SUCCESS && result == MPI_SUCCESS)
    {
        successful = true;
        PidAtFork_  = os_pid;

        // Indicate that process exists but has not yet completed initialization.
        State_ = State_Initializing;

        MyNode->SetAffinity( this );

        if (Backup)
        {
            if ( !parent )
            {   // Unexpectedly have null parent pointer
                snprintf(la_buf, sizeof(la_buf),
                         "[CProcess::CProcess], No Primary for Backup process!\n");
                mon_log_write(MON_PROCESS_PROCESS_2, SQ_LOG_ERR, la_buf);
            }
            else if (strcmp (parent->Name, Name) != 0)
            {
                snprintf(la_buf, sizeof(la_buf),
                         "[CProcess::Create], Primary & Backup process name "
                         "don't match!\n");
                mon_log_write(MON_PROCESS_CREATE_10, SQ_LOG_ERR, la_buf);
            }
            else
            {
                // primary & backup processes are parent's of each other
                parent->Parent_Nid = Nid;
                parent->Parent_Pid = Pid;
                parent->Backup = false;
                if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
                   trace_printf("%s@%d" " - Assigning parent nid=" "%d" ", pid=" "%d"  " with (really child) parent nid=" "%d" ", parent pid=" "%d" "\n", method_name, __LINE__, parent->Nid, parent->Pid, Nid, Pid);
            }
        }

        if ( Backup )
        {   // For a backup process the "parent" is the CProcess object
            // for the primary process.  So find the real parent process
            // object.
            parent = Nodes->GetLNode ( PairParentNid )
                          ->GetProcessL( PairParentPid );
        }

        if ( !UnHooked &&  parent  && !Backup )
        {   // Parent process object keeps track of child processes
            // created on this node.  Needed in case parent process
            // exits abnormally.

            if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL
                                  | TRACE_PROCESS_DETAIL))
                trace_printf("%s@%d - Child process %s (%d, %d) added to "
                             "parent %s (%d, %d)\n", method_name, __LINE__,
                             Name, Nid, Pid, parent->GetName(),
                             parent->GetNid(), parent->GetPid());

            parent->childAdd ( Nid, Pid );

        }

        Monitor->writeProcessMapBegin( Name, Nid, Pid, Verifier,
                                       parent ? parent->GetNid() : -1,
                                       parent ? parent->GetPid() : -1,
                                       parent ? parent->GetVerifier() : -1,
                                       program.c_str() );
    }
    else
    {
        successful = false;
        result = MPI_ERR_SPAWN;

        if (NameServerEnabled)
        {
            rc2 = PtpClient->ProcessInit( this
                                        , tag
                                        , result
                                        , parent->Nid );
            if (rc2)
            {
                char la_buf[MON_STRING_BUF_SIZE];
                CLNode *parentLNode = NULL;
                parentLNode = Nodes->GetLNode( parent->Nid );
                snprintf( la_buf, sizeof(la_buf)
                        , "[%s] - Can't send process create failure "
                          "for process %s (%d, %d) "
                          "result to parent node %s, nid=%d, result=%d\n"
                        , method_name
                        , GetName()
                        , GetNid()
                        , GetPid()
                        , parentLNode->GetNode()->GetName()
                        , parentLNode->GetNid(), result );
                mon_log_write(MON_PROCESS_CREATE_13, SQ_LOG_ERR, la_buf);
            }
        }

        char buf[MON_STRING_BUF_SIZE];
        snprintf(buf, sizeof(buf), "[CProcess::Create], Failed to start process %s path= %s.\n", Name, path.c_str());
        mon_log_write(MON_PROCESS_CREATE_11, SQ_LOG_ERR, buf);
    }

    // release allocated memory
    for (i = 0; argv[i]; i++)
    {
        delete [] argv[i];
    }
    delete [] argv;

    for (i = 0; childEnv[i]; i++)
    {
        delete [] childEnv[i];
    }

    TRACE_EXIT;

    return successful;
}
#endif

#ifndef NAMESERVER_PROCESS
bool CProcess::Dump (CProcess *dumper, char *core_path)
{
    bool status = FAILURE;
    CReplDump *repl;

    const char method_name[] = "CProcess::Dump";
    TRACE_ENTRY;

    switch (DumpState)
    {
        case Dump_Ready:
            DumpState = Dump_Pending;
            dumpFile_ = core_path;
            DumperNid = dumper->Nid;
            DumperPid = dumper->Pid;
            DumperVerifier = dumper->Verifier;
            status = SUCCESS;
            if (trace_settings & TRACE_PROCESS)
                trace_printf("%s@%d - DumpState=Dump_Pending, pid=%d\n",
                             method_name, __LINE__, Pid);
            repl = new CReplDump(this);
            Replicator.addItem(repl);
            break;

        default:
            if (trace_settings & TRACE_PROCESS)
                trace_printf("%s@%d - Dump already in progress, pid=%d\n",
                             method_name, __LINE__, Pid);
            break;
    }

    TRACE_EXIT;

    return status;
}
#endif

#ifndef NAMESERVER_PROCESS
static void cprocess_dump_cb(void *ctx, pid_t pid, int status)
{
    CLNode   *lnode = static_cast<CLNode *>(ctx);
    lnode->DumpCallback( lnode->GetNid(), pid, status );
}
#endif

#ifndef NAMESERVER_PROCESS
void CProcess::DumpBegin (int nid, int pid, Verifier_t verifier, char *core_path)
{
    char           *argv[6];
    char           *cmd;
    char            core_file[MAX_PROCESS_PATH];
    char            core_pid[20];
    char            date[20];
    int             err;
    struct timeval  tv;
    struct tm       tx;

    const char method_name[] = "CProcess::DumpBegin";
    TRACE_ENTRY;

    DumperNid = nid;
    DumperPid = pid;
    DumperVerifier = verifier;
    if (Clone)
    {
        DumpState = Dump_InProgress;
    }
    else
    {
        // Increment reference count for process object until DumpEnd
        incrReplRef();

        gettimeofday(&tv, NULL);
        localtime_r(&tv.tv_sec, &tx);
        snprintf(date, sizeof(date), "%d-%02d-%02d_%02d-%02d-%02d",
                tx.tm_year + 1900,
                tx.tm_mon + 1,
                tx.tm_mday,
                tx.tm_hour,
                tx.tm_min,
                tx.tm_sec);

        string program;
        Config->strIdToString ( programStrId_, program );

        cmd = rindex((char *) program.c_str(), '/');
        if (cmd == NULL)
            cmd = (char *) program.c_str();
        else
            cmd++; // past '/'
        // date=%Y-%m-%d_%H-%M-%S
        // core_file=<path>/core.<date>.<pname>.<pid>.<cmd>
        snprintf(core_file, sizeof(core_file), "%s/core.%s.%s.%d.%s",
                core_path,
                date,
                &Name[1],
                Pid,
                cmd);
        corefile_ = core_file;

        if (trace_settings & TRACE_PROCESS)
            trace_printf("%s@%d - starting mondump for pid=%d, core-file=%s\n",
                         method_name, __LINE__, Pid, core_file);

        argv[0] = (char *) "mondump";
        snprintf(core_pid, sizeof(core_pid), "%d", Pid);
        argv[1] = core_pid;
        argv[2] = core_file;
        if ((nid == Nid) || getenv("SQ_VIRTUAL_NODES"))
           argv[3] = NULL;
        else
        {
           argv[3] = (char *) Nodes->GetNode(Nid)->GetName();
           argv[4] = getenv("MPI_TMPDIR");
           argv[5] = NULL;
        }
        CLNode   *lnode = Nodes->GetLNode( Nid );
        err = IntProcess.create(argv[0],
                                argv,
                                cprocess_dump_cb, // cb
                                Pid,              // cb_pid
                                lnode,            // cb_ctx
                                NULL);
        if (err == 0)
        {
            dumpFile_ = core_file;
            DumpState = Dump_InProgress;
        }
        else
        {
            DumpState = Dump_Complete;
            CReplDumpComplete *repl = new CReplDumpComplete(this);
            Replicator.addItem(repl);
            CompleteDump(Dump_Failed, NULL);
        }
    }

    if (trace_settings & TRACE_PROCESS)
    {
        if (DumpState == Dump_InProgress)
            trace_printf("%s@%d - DumpState=Dump_InProgress, pid=%d\n",
                         method_name, __LINE__, Pid);
        else
            trace_printf("%s@%d - DumpState=Dump_Complete, pid=%d\n",
                         method_name, __LINE__, Pid);
    }

    TRACE_EXIT;
}
#endif

#ifndef NAMESERVER_PROCESS
const char *DumpStateString( DUMPSTATE state)
{
    const char *str;

    switch( state )
    {
        case Dump_Unknown:
            str = "Dump_Unknown";
            break;
        case Dump_Ready:
            str = "Dump_Ready";
            break;
        case Dump_Pending:
            str = "Dump_Pending";
            break;
        case Dump_InProgress:
            str = "Dump_InProgress";
            break;
        case Dump_Complete:
            str = "Dump_Complete";
            break;
        default:
            str = "DumpState - Undefined";
            break;
    }

    return( str );
}
#endif

#ifndef NAMESERVER_PROCESS
void CProcess::DumpEnd (DUMPSTATUS status, char *core_file)
{
    const char method_name[] = "CProcess::DumpEnd";
    TRACE_ENTRY;

    if (trace_settings & TRACE_PROCESS)
        trace_printf("%s@%d - name=%s, DumpState=%s, DumpStatus=%d, pid=%d, core_file=%s\n",
                     method_name, __LINE__, Name, DumpStateString(DumpState), status, Pid, core_file);

    if ( DumpState != Dump_Ready )
    {
        CompleteDump(status, core_file);
    }

    // Decrement reference count for process object
    decrReplRef();

    TRACE_EXIT;
}
#endif

#ifndef NAMESERVER_PROCESS
struct message_def * CProcess::DeathMessage( )
{
    struct message_def *msg;

    const char method_name[] = "CProcess::DeathMessage";
    TRACE_ENTRY;

    // Record statistics (sonar counters)
    if (sonar_verify_state(SONAR_ENABLED | SONAR_MONITOR_ENABLED))
       MonStats->notice_death_Incr();

    msg = new struct message_def;
    msg->type = MsgType_ProcessDeath;
    msg->noreply = true;
    msg->u.request.type = ReqType_Notice;
    msg->u.request.u.death.nid = Nid;
    msg->u.request.u.death.pid = Pid;
    msg->u.request.u.death.verifier = Verifier;
    msg->u.request.u.death.trans_id.txid[0] = 0;
    msg->u.request.u.death.trans_id.txid[1] = 0;
    msg->u.request.u.death.trans_id.txid[2] = 0;
    msg->u.request.u.death.trans_id.txid[3] = 0;
    msg->u.request.u.death.aborted = IsAbended();
    strcpy(msg->u.request.u.death.process_name, Name);
    msg->u.request.u.death.type = Type;
#ifdef USE_SEQUENCE_NUM
    msg->u.request.u.death.seqnum = Monitor->GetTimeSeqNum();
#endif

    if (trace_settings & ( TRACE_TMSYNC | TRACE_SYNC_DETAIL | TRACE_PROCESS_DETAIL | TRACE_REQUEST_DETAIL))
        trace_printf("%s@%d - Death notice for process %s (%d, %d)\n",
                     method_name, __LINE__, Name, Nid, Pid );
    TRACE_EXIT;

    return msg;
}
#endif

#ifndef NAMESERVER_PROCESS
void CProcess::Exit( CProcess *parent )
{
    char la_buf[MON_STRING_BUF_SIZE];

    const char method_name[] = "CProcess::Exit";
    TRACE_ENTRY;

    if ( DumpState != Dump_Ready )
    {
        DumpEnd( Dump_Failed, (char *)corefile_.c_str() );
    }

    if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
       trace_printf( "%s@%d" " - Process %s (%d,%d:%d) is exiting, parent process %s (%d,%d:%d)\n"
                   , method_name, __LINE__
                   , GetName(), GetNid(), GetPid(), GetVerifier()
                   , parent?parent->GetName():""
                   , parent?parent->GetNid():-1
                   , parent?parent->GetPid():-1
                   , parent?parent->GetVerifier():-1 );

    SetState(State_Stopped);

    if (!Clone && parent && NameServerEnabled)
    {
        if (parent->GetNid() != GetNid())
        { // parent is remote
            if (parent->childCount() == 0)
            { // process is parent's last child
                if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
                {
                    trace_printf( "%s@%d - Parent's last child, deleting clone process %s, (%d,%d:%d)\n"
                                , method_name, __LINE__
                                , parent->GetName()
                                , parent->GetNid()
                                , parent->GetPid()
                                , parent->GetVerifier() );
                }
                Nodes->DeleteCloneProcess( parent );
                parent = NULL;
            }
            else
            {
                ProcessInfoNs_reply_def processInfo;
                int rc = Nodes->GetProcessInfoNs( parent->GetNid()
                                                , parent->GetPid()
                                                , parent->GetVerifier()
                                                , &processInfo);
                if (rc == MPI_ERR_NAME)
                { // parent exited
                    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
                    {
                        trace_printf( "%s@%d - Deleting clone process %s, (%d,%d:%d)\n"
                                    , method_name, __LINE__
                                    , parent->GetName()
                                    , parent->GetNid()
                                    , parent->GetPid()
                                    , parent->GetVerifier() );
                    }
                    Nodes->DeleteCloneProcess( parent );
                    parent = NULL;
                }
            }
        }
    }

    // if the env is set to not deliver death messages upon node down,
    // check the state of the process' node.
    bool supplyProcessDeathNotices = true;
    if (!Monitor->IsNodeDownDeathNotices())
    {
        CNode * node = Nodes->GetLNode(GetNid())->GetNode();
        // if process' node is being killed, do not supply process death notices
        supplyProcessDeathNotices = node->IsSoftNodeDown()
                                        ? node->IsSoftNodeDown()
                                        : !node->IsKillingNode();
    }

    if(  NoticeHead &&
        !MyNode->IsKillingNode() &&
        !(Type == ProcessType_DTM && IsAbended()) &&
        supplyProcessDeathNotices )
    {
        if ( !Clone && NameServerEnabled )
        {
            // Notify all remote registered nodes of this process' death
            NoticeHead->NotifyRemote();
        }
        // Notify all local registered processes of this process' death
        NoticeHead->NotifyAll();
    }

    if ( !Clone && !Paired )
    {
        switch (Type)
        {
            case ProcessType_TSE:
            case ProcessType_ASE:
                MyNode->delFromQuiesceExitPids( GetPid(), GetVerifier() );

                if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_PROCESS_DETAIL | TRACE_REQUEST_DETAIL))
                    trace_printf("%s%d: pid %d deleted from quiesce exit list\n", method_name, __LINE__, GetPid());

                if (MyNode->isInQuiesceState())
                {
                    if (MyNode->isQuiesceExitPidsEmpty())
                    {
                        HealthCheck.setState(MON_SCHED_NODE_DOWN);  // schedule a node down req
                    }
                }
                else
                {   // unmount volumes only if node is not quiescing.
                    Devices->UnMountVolume( Name, Backup );
                }
                break;
            case ProcessType_DTM:
                if ( IsAbended() )
                {
                    if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
                       trace_printf( "%s@%d - DTM abended %s (%d, %d:%d)\n"
                                   , method_name, __LINE__, Name, Nid, Pid, Verifier);
                    if ( !MyNode->IsKillingNode() &&
                         !IsPersistent() &&
                          MyNode->GetShutdownLevel() != ShutdownLevel_Abrupt )
                    {
                        MyNode->SetDTMAborted( true );
                    }
                }
                else
                {
                    if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
                       trace_printf("%s@%d" " - DTM stopped normally" "\n", method_name, __LINE__);
                    if ( !MyNode->IsKillingNode() &&
                         !IsPersistent() &&
                          MyNode->GetShutdownLevel() == ShutdownLevel_Undefined )
                    {
                        MyNode->SetDTMAborted( true );
                    }
                    else
                    {
                        if ( Monitor->GetTmLeader() == MyPNID )
                        {
                            // set the clean shutdown condition
                            char key[MAX_KEY_NAME];
                            char value[10];
                            strcpy(key,"Clean_Shutdown");
                            strcpy(value,"True");
                            Config->GetClusterGroup()->Set( key, value );
                        }
                    }
                }
                break;
            case ProcessType_SMS:
                if ( IsAbended() )
                {
                    if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
                       trace_printf( "%s@%d - SMS abended %s (%d, %d:%d)\n"
                                   , method_name, __LINE__, Name, Nid, Pid, Verifier);
                    if ( !MyNode->IsKillingNode() &&
                          MyNode->GetShutdownLevel() != ShutdownLevel_Abrupt )
                    {
                        MyNode->SetSMSAborted( true );
                    }
                }
                else
                {
                    if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
                       trace_printf("%s@%d" " - SMS stopped normally" "\n", method_name, __LINE__);
                    if ( !MyNode->IsKillingNode() &&
                          MyNode->GetShutdownLevel() == ShutdownLevel_Undefined )
                    {
                        MyNode->SetSMSAborted( true );
                    }
                }
                break;
            case ProcessType_NameServer:
                if ( IsAbended() )
                {
                    if (!Clone)
                    {
                        NameServer->NameServerExited();
                    }
                    if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
                       trace_printf("%s@%d" " - NameServer abended" "\n", method_name, __LINE__);
                }
                else
                {
                    if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
                       trace_printf("%s@%d" " - NameServer stopped normally" "\n", method_name, __LINE__);
                }
                break;
            case ProcessType_Watchdog:
                if ( IsAbended() )
                {
                    if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
                       trace_printf("%s@%d" " - Watchdog abended" "\n", method_name, __LINE__);
                }
                else
                {
                    if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
                       trace_printf("%s@%d" " - Watchdog stopped normally" "\n", method_name, __LINE__);
                }
                break;
            case ProcessType_MXOSRVR:
            case ProcessType_Generic:
                if ( MyNode->GetState() == State_Up &&
                    !MyNode->IsKillingNode() &&
                     MyNode->GetShutdownLevel() == ShutdownLevel_Undefined )
                {
                    // Send logical node's SSMP process this process' death message
                    CLNode *lnode = MyNode->GetLNode( Nid );
                    if ( lnode )
                    {
                        CProcess *ssmpProcess = lnode->GetSSMProc();
                        if ( ssmpProcess && Pid != -1 )
                        {
                            if (trace_settings & TRACE_PROCESS)
                                trace_printf("%s@%d: Queueing death notice for SSMP process for %s (%d, %d:%d)\n",
                                             method_name, __LINE__, Name, Nid, Pid, Verifier);

                            ssmpProcess->ssmpNoticesLock_.lock();
                            ssmpProcess->ssmpNotices_.push_back( DeathMessage() );
                            ssmpProcess->ssmpNoticesLock_.unlock();
                            SQ_theLocalIOToClient->nudgeNotifier ();
                        }
                        else
                        {
                            if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_PROCESS_DETAIL | TRACE_REQUEST_DETAIL | TRACE_NOTICE ))
                                trace_printf("%s@%d: No SSMP process found in nid=%d\n",
                                             method_name, __LINE__, lnode->GetNid());
                        }
                    }
                }
                break;

            case ProcessType_SSMP:
                // Indicate no SSM process on this node.
                Nodes->GetLNode ( Nid )->SetSSMProc ( NULL );
                break;

            case ProcessType_AMP:
            case ProcessType_Backout:
            case ProcessType_VolumeRecovery:
            case ProcessType_SPX:
            case ProcessType_PSD:
            case ProcessType_PERSIST:
                // No special handling needed on exit
                break;
            default:

                snprintf(la_buf, sizeof(la_buf),
                         "[CProcess::Exit], Invalid process type!\n");
                mon_log_write(MON_PROCESS_EXIT_1, SQ_LOG_ERR, la_buf);
        }

        // Remove this child process from parent's child-process-list.
        if ( (parent != NULL) && (parent->GetState() == State_Up) )
        {
            parent->childRemove( Nid, Pid);
            if (NameServerEnabled)
            {
                parent->childUnHookedRemove( Nid, Pid);
            }
        }

        // Check if we need to output a entry into the process id map log file
        if ( PidMap )
        {
            Monitor->writeProcessMapEnd( Name, Nid, Pid, Verifier,
                                         parent ? parent->GetNid() : -1,
                                         parent ? parent->GetPid() : -1,
                                         parent ? parent->GetVerifier() : -1,
                                         program() );
        }
    }

    if ( Clone && Pid != -1 )
    {
        if ( Type == ProcessType_SPX &&
             MyNode->GetShutdownLevel() == ShutdownLevel_Undefined &&
             supplyProcessDeathNotices )
        {
            // Send local SPX this SPX's death message
            CLNode *lnode = MyNode->GetFirstLNode();
            for ( ; lnode; lnode = lnode->GetNextP() )
            {
                CProcess *spxProcess = lnode->GetProcessLByType( ProcessType_SPX );
                if ( spxProcess && MyNode->GetState() == State_Up )
                {
                    SQ_theLocalIOToClient->putOnNoticeQueue( spxProcess->Pid
                                                           , spxProcess->Verifier
                                                           , DeathMessage()
                                                           , NULL);

                    if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
                       trace_printf( "%s@%d" " - Sending death message of %s (%d,%d:%d) to %s (%d,%d:%d)\n"
                                   , method_name, __LINE__
                                   , GetName(), GetNid(), GetPid(), GetVerifier()
                                   , spxProcess->GetName(), spxProcess->GetNid()
                                   , spxProcess->GetPid(), spxProcess->GetVerifier());
                }
                else
                {
                    if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_PROCESS_DETAIL | TRACE_REQUEST_DETAIL))
                        trace_printf("%s@%d: No SPX process found in nid=%d\n",
                                     method_name, __LINE__, lnode->GetNid());
                }
            }
        }

        if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_PROCESS_DETAIL | TRACE_REQUEST_DETAIL))
            trace_printf( "%s@%d" " - Death message check of %s (%d,%d:%d) type=%s, node phase=%s, send death notices=%d\n"
                        , method_name, __LINE__
                        , GetName(), GetNid(), GetPid(), GetVerifier()
                        , ProcessTypeString(GetType()), NodePhaseString( MyNode->GetPhase() )
                        , supplyProcessDeathNotices );

        if ( Type == ProcessType_DTM &&
             MyNode->GetPhase() == Phase_Ready &&
             supplyProcessDeathNotices )
        {
            // Send local DTMs this DTM's death message
            CLNode *lnode = MyNode->GetFirstLNode();
            for ( ; lnode; lnode = lnode->GetNextP() )
            {
                CProcess *tmProcess = lnode->GetProcessLByType( ProcessType_DTM );
                if ( tmProcess && MyNode->GetState() == State_Up )
                {
                    SQ_theLocalIOToClient->putOnNoticeQueue( tmProcess->Pid
                                                           , tmProcess->Verifier
                                                           , DeathMessage()
                                                           , NULL);

                    if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
                       trace_printf( "%s@%d" " - Sending death message of %s (%d,%d:%d) to %s (%d,%d:%d)\n"
                                   , method_name, __LINE__
                                   , GetName(), GetNid(), GetPid(), GetVerifier()
                                   , tmProcess->GetName(), tmProcess->GetNid()
                                   , tmProcess->GetPid(), tmProcess->GetVerifier());

                }
                else
                {
                    if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_PROCESS_DETAIL | TRACE_REQUEST_DETAIL))
                        trace_printf("%s@%d: No DTM process found in nid=%d\n",
                                     method_name, __LINE__, lnode->GetNid());
                }
            }
        }
    }

    if ( parent && !parent->IsClone() && Pid != -1 )
    {

        // If process and parent are DTMs suppress death
        // message here, it was delivered above
        if ( parent->IsSystemMessages()   &&
             parent->GetState() == State_Up &&
             !MyNode->IsKillingNode() &&
             !(GetType() == ProcessType_DTM &&
               parent->GetType()  == ProcessType_DTM) &&
             supplyProcessDeathNotices )
        {
            SQ_theLocalIOToClient->putOnNoticeQueue( parent->Pid
                                                   , parent->Verifier
                                                   , DeathMessage()
                                                   , NULL);

            if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
               trace_printf( "%s@%d" " - Sending death message of %s (%d,%d:%d) to %s (%d,%d:%d) \n"
                           , method_name, __LINE__
                           , GetName(), GetNid(), GetPid(), GetVerifier()
                           , parent->GetName(), parent->GetNid()
                           , parent->GetPid(), parent->GetVerifier());
        }
        else
        {
            if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
               trace_printf("%s@%d" " - Parent doesn't want Death message" "\n", method_name, __LINE__);
        }
    }

    if (NameServerEnabled)
    {
        if ( parent )
        {
            if ( parent->IsClone() && Pid != -1 )
            {
                int targetNid = parent->GetNid();
                CLNode *targetLNode = Nodes->GetLNode( targetNid );
                // Send the process exit to the parent node
                int rc = PtpClient->ProcessExit( this
                                               , targetNid
                                               , targetLNode->GetNode()->GetName() );
                if (rc)
                {
                    char la_buf[MON_STRING_BUF_SIZE];
                    snprintf( la_buf, sizeof(la_buf)
                            , "[%s] - Can't send process exit "
                              "for process %s (%d, %d) "
                              "to parent node %s, nid=%d\n"
                            , method_name
                            , GetName()
                            , GetNid()
                            , GetPid()
                            , targetLNode->GetNode()->GetName()
                            , targetLNode->GetNid() );
                    mon_log_write(MON_PROCESS_PROCEXIT_1, SQ_LOG_ERR, la_buf);
                }
            }
        }
        else
        {
            if (GetParentNid() != -1)
            {
                int targetNid = GetParentNid();
                CLNode *targetLNode = Nodes->GetLNode( targetNid );
                // Send the process exit to the parent node
                int rc = PtpClient->ProcessExit( this
                                               , targetNid
                                               , targetLNode->GetNode()->GetName() );
                if (rc)
                {
                    char la_buf[MON_STRING_BUF_SIZE];
                    snprintf( la_buf, sizeof(la_buf)
                            , "[%s] - Can't send process exit "
                              "for process %s (%d, %d) "
                              "to parent node %s, nid=%d\n"
                            , method_name
                            , GetName()
                            , GetNid()
                            , GetPid()
                            , targetLNode->GetNode()->GetName()
                            , targetLNode->GetNid() );
                    mon_log_write(MON_PROCESS_PROCEXIT_2, SQ_LOG_ERR, la_buf);
                }
            }
        }
        procExitNotifierNodes();
    }

    TRACE_EXIT;
}
#endif

#ifndef NAMESERVER_PROCESS
void CProcess::GenerateEvent( int event_id, int length, char *data )
{
    struct message_def *msg;

    const char method_name[] = "CProcess::GenerateEvent";
    TRACE_ENTRY;
    if( Clone )
    {
        if ( Event_messages )
        {
            // Replicate the event to other nodes
            CReplEvent *repl = new CReplEvent(event_id, length, data, Nid, Pid, Verifier);
            Replicator.addItem(repl);
        }
    }
    else
    {
        if ( Event_messages )
        {
            msg = new struct message_def;
            msg->type = MsgType_Event;
            msg->noreply = true;
            msg->u.request.type = ReqType_Notice;
            msg->u.request.u.event_notice.event_id = event_id;
            msg->u.request.u.event_notice.length = length;
            memset( msg->u.request.u.event_notice.data, 0, MAX_SYNC_DATA );
            if (length && data)
            {
                memmove( msg->u.request.u.event_notice.data, data, (length>MAX_SYNC_DATA)?MAX_SYNC_DATA:length );
            }

            SQ_theLocalIOToClient->putOnNoticeQueue( Pid
                                                   , Verifier
                                                   , msg
                                                   , NULL);
        }
    }
    TRACE_EXIT;
}
#endif

CProcess *CProcess::GetBackup (void)
{
    CLNode *node = NULL;
    CProcess *parent = NULL;
    CProcess *backup = NULL;

    node = Nodes->GetLNode (Parent_Nid);
    if (node)
    {
        parent = node->GetProcessL(Parent_Pid);
        if (parent)
        {
            backup = (parent->Backup ? parent : NULL);
        }
    }

    if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS))
       trace_printf("CProcess::GetBackup" "@%d" " - name= %s(%d:%d), parent=%p(%s), backup=%p" "\n", __LINE__, Name, Parent_Nid, Parent_Pid, parent, parent ? parent->Name : "None", backup);

    return backup;
}


// see: CProcessContainer::GetProcess (int pid)
// see: CProcessContainer::GetProcess (char *name, bool checkstate)

CProcess *CProcess::GetProcessByType( PROCESSTYPE type )
{
    CProcess *entry = this;

    const char method_name[] = "CProcess::GetProcessByType";
    TRACE_ENTRY;
    while (entry)
    {
        if (entry->Type == type)
        {
            // Only return entry if it has completed startup
            if (entry->State_ != State_Up)
            {
                entry = NULL;
            }
            break;
        }
        entry = entry->next_;
    }
    TRACE_EXIT;
    return entry;
}

// see: CLNode::GetProcessL (int pid)
// see: CLNode::GetProcessL (char *name, bool checkstate)

CProcess *CProcess::GetProcessLByType( PROCESSTYPE type )
{
    CProcess *entry = this;

    const char method_name[] = "CProcess::GetProcessLByType";
    TRACE_ENTRY;
    while (entry)
    {
        if (entry->Type == type)
        {
            // Only return entry if it has completed startup
            if (entry->State_ != State_Up)
            {
                entry = NULL;
            }
            break;
        }
        entry = entry->nextL_;
    }
    TRACE_EXIT;
    return entry;
}

bool CProcess::MakePrimary (void)
{
    bool successful;
    CLNode *node = NULL;
    CProcess *primary = NULL;
    CProcess *backup = NULL;

    const char method_name[] = "CProcess::MakePrimary";
    TRACE_ENTRY;
    if (Backup)
    {
        backup = this;
        if (Parent_Nid != -1)
        {
            node = Nodes->GetLNode (Parent_Nid);
            if (node)
            {
                if (Parent_Pid != -1)
                {
                    primary = node->GetProcessL(Parent_Pid);
                    if (!primary)
                    {
                        if (trace_settings & TRACE_REQUEST_DETAIL)
                           trace_printf("%s@%d" " - Can't find Primary process" "\n", method_name, __LINE__);
                    }
                }
            }
            else
            {
                if (trace_settings & TRACE_REQUEST_DETAIL)
                   trace_printf("%s@%d" " - Can't find Primary process's node" "\n", method_name, __LINE__);
            }
        }
    }
    else
    {
        primary = this;
        if (Parent_Nid != -1)
        {
            node = Nodes->GetLNode (Parent_Nid);
            if (node)
            {
                if (Parent_Pid != -1)
                {
                    backup = node->GetProcessL(Parent_Pid);
                    if (backup)
                    {
                        backup = (backup->Backup ? backup : NULL);
                    }
                }
                else
                {
                    if (trace_settings & TRACE_REQUEST_DETAIL)
                       trace_printf("%s@%d" " - Can't find Backup process" "\n", method_name, __LINE__);
                }
            }
            else
            {
                if (trace_settings & TRACE_REQUEST_DETAIL)
                   trace_printf("%s@%d" " - Can't find Backup process's node" "\n", method_name, __LINE__);
            }
        }
    }

    if (primary == this)
    {
        if (trace_settings & TRACE_REQUEST_DETAIL)
           trace_printf("%s@%d" "- Primary process will continue as Primary" "\n", method_name, __LINE__);
        if (!backup)
        {
            primary->Parent_Nid = -1;
            primary->Parent_Pid = -1;
        }
        successful = true;
    }
    else if (backup == this)
    {
        backup->Backup = false;
        if (primary)
        {
            primary->Backup = true;
            if (trace_settings & TRACE_REQUEST_DETAIL)
               trace_printf("%s@%d" "- Old Primary process is now the Backup" "\n", method_name, __LINE__);
        }
        else
        {
            backup->Parent_Nid = -1;
            backup->Parent_Pid = -1;
        }
        if (trace_settings & TRACE_REQUEST_DETAIL)
           trace_printf("%s@%d" "- Backup process is now the Primary" "\n", method_name, __LINE__);
        successful = true;
    }
    else
    {
        successful = false;
    }

    TRACE_EXIT;
    return successful;
}

#ifndef NAMESERVER_PROCESS
bool CProcess::MyTransactions( struct message_def *msg )
{
    int idx;
    CNotice *notice = NoticeHead;

    const char method_name[] = "CProcess::MyTransactions";
    TRACE_ENTRY;

    while (notice)
    {
        if ( !isNull( notice->TransID ) )
        {
            idx = msg->u.reply.u.trans_info.num_processes;
            msg->u.reply.u.trans_info.procs[idx].nid = notice->Nid;
            msg->u.reply.u.trans_info.procs[idx].pid = notice->Pid;
            msg->u.reply.u.trans_info.procs[idx].trans_id = notice->TransID;
            msg->u.reply.u.trans_info.num_processes++;
            if (msg->u.reply.u.trans_info.num_processes >= MAX_PROC_LIST)
            {
                msg->u.reply.u.trans_info.return_code = MPI_ERR_TRUNCATE;
                return FAILURE;
            }
        }
        notice = notice->GetNext();
    }

    TRACE_EXIT;
    return SUCCESS;
}
#endif

#ifndef NAMESERVER_PROCESS
bool CProcess::Open (CProcess * opened_process, int death_notification)
{
    const char method_name[] = "CProcess::Open";
    TRACE_ENTRY;

    bool status;

    if ((opened_process->StartupCompleted) &&
        (opened_process->State_ == State_Up) && (State_ == State_Up))
    {
        if ( death_notification
             && !((opened_process->Parent_Nid == Nid) &&
                  (opened_process->Parent_Pid == Pid)) )
        {
            _TM_Txid_External transid;
            transid = null_trans();
            opened_process->RegisterDeathNotification( Nid
                                                     , Pid
                                                     , Verifier
                                                     , Name
                                                     , transid);
        }
        status = SUCCESS;
    }
    else
    {
        char buf[MON_STRING_BUF_SIZE];
        snprintf(buf, sizeof(buf), "[CProcess::Open], Can't Open Process %s "
                 "has not completed startup protocol!\n", opened_process->Name);
        mon_log_write(MON_PROCESS_OPEN_1, SQ_LOG_ERR, buf);

        status = FAILURE;
    }
    TRACE_EXIT;

    return status;
}
#endif

void CProcessContainer::close_fds ( void )
{
    DIR *dirp = opendir("/proc/self/fd");
    for (;;)
    {
        if (dirp == NULL)
            break;
        struct dirent *direntp = readdir(dirp);
        if (direntp == NULL)
            break;
        if (direntp->d_ino == 0) // invalid inode-number
            continue;
        if (direntp->d_name[0] == '.') // relative
            continue;
        int fd;
        sscanf(direntp->d_name, "%d", &fd);
        if (fd > 2)
            close(fd);
    }
    if (dirp != NULL)
        closedir(dirp);
}

#ifndef NAMESERVER_PROCESS
CNotice *CProcess::RegisterDeathNotification( int nid
                                            , int pid
                                            , Verifier_t verifier
                                            , const char *name
                                            , _TM_Txid_External trans_id )
{
    CNotice *notice = NULL;

    const char method_name[] = "CProcess::RegisterDeathNotification";
    TRACE_ENTRY;

    deathInterestLock_.lock();

    if ( NoticeHead )
    {
        notice = NoticeHead->GetNotice( nid, pid, verifier, trans_id );
    }
    if ( notice == NULL )
    {
        notice = new CNotice (nid, pid, verifier, name, trans_id, this);
        if (NoticeHead == NULL)
        {
            NoticeHead = NoticeTail = notice;
        }
        else
        {
            NoticeTail = NoticeTail->Link (notice);
        }
    }
    else
    {
        // We have a duplicate registation request for notification.
        // Just return original notice object without error.
        if (trace_settings & (TRACE_SYNC | TRACE_REQUEST))
           trace_printf("%s@%d" " - Already have registered for this notice" "\n", method_name, __LINE__);
    }

    deathInterestLock_.unlock();

    TRACE_EXIT;
    return notice;
}
#endif

#ifndef NAMESERVER_PROCESS
void CProcess::ReplyNewProcess (struct message_def * reply_msg,
                                CProcess * process, int result)
{
    const char method_name[] = "CProcess::ReplyNewProcess";
    TRACE_ENTRY;

    // the parent gets a new_process reply
    reply_msg->type = MsgType_Service;
    reply_msg->noreply = false;
    reply_msg->reply_tag = process->ReplyTag;
    reply_msg->u.reply.type = ReplyType_NewProcess;
    reply_msg->u.reply.u.new_process.nid = process->Nid;
    reply_msg->u.reply.u.new_process.pid = process->Pid;
    reply_msg->u.reply.u.new_process.verifier = process->Verifier;
    strcpy (reply_msg->u.reply.u.new_process.process_name,process->Name);
    reply_msg->u.reply.u.new_process.return_code = result;

    if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS | TRACE_SYNC))
        trace_printf("%s@%d - Created process %s (%d, %d:%d), sending reply to "
                     "%s (%d, %d), result=%d\n", method_name, __LINE__,
                     process->Name, process->Nid, process->Pid, process->Verifier,
                     Name, Nid, Pid, result);

    // send reply to the parent
    SQ_theLocalIOToClient->sendCtlMsg
        ( Pid, MC_SReady, ((SharedMsgDef*)reply_msg)-> trailer.index );

    TRACE_EXIT;
}
#endif


#ifndef NAMESERVER_PROCESS
void CProcess::SendProcessCreatedNotice(CProcess *parent, int result)
{
    const char method_name[] = "CProcess::SendProcessCreatedNotice";
    TRACE_ENTRY;

    struct message_def *reply_msg;

    reply_msg = new struct message_def;

    // the parent gets a child started notice
    reply_msg->type = MsgType_ProcessCreated;
    reply_msg->noreply = true;
    reply_msg->u.request.type = ReqType_Notice;
    reply_msg->u.request.u.process_created.nid = Nid;
    reply_msg->u.request.u.process_created.pid = Pid;
    reply_msg->u.request.u.process_created.verifier = Verifier;
    reply_msg->u.request.u.process_created.tag = Tag;
    strcpy(reply_msg->u.request.u.process_created.port, Port);
    strcpy(reply_msg->u.request.u.process_created.process_name, Name);
    reply_msg->u.request.u.process_created.return_code = result;
    if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS | TRACE_SYNC))
        trace_printf("%s@%d - Created process %s (%d, %d), sending process "
                     "created notice to %s (%d, %d), result=%d\n",
                     method_name, __LINE__, Name, Nid, Pid,
                     parent->Name, parent->Nid, parent->Pid, result);

    // send notice to the parent
    SQ_theLocalIOToClient->putOnNoticeQueue( parent->Pid
                                           , parent->Verifier
                                           , reply_msg
                                           , NULL);

    TRACE_EXIT;
}
#endif

struct message_def * CProcess::GetDeathNotice( void )
{
    const char method_name[] = "CProcess::GetDeathNotice";
    TRACE_ENTRY;

    struct message_def *notice = NULL;

    ssmpNoticesLock_.lock();
    if ( ! ssmpNotices_.empty() )
    {
        notice = ssmpNotices_.front();
        if ( notice )
        {
            ssmpNotices_.pop_front();
        }
    }
    ssmpNoticesLock_.unlock();

    TRACE_EXIT;

    return notice;
}

void CProcess::PutDeathNotice( struct message_def * notice)
{
    const char method_name[] = "CProcess::PutDeathNotice";
    TRACE_ENTRY;

    ssmpNoticesLock_.lock();
    ssmpNotices_.push_front ( notice );
    ssmpNoticesLock_.unlock();

    TRACE_EXIT;
}

void CProcess::Switch( CProcess *parent )
{
    const char method_name[] = "CProcess::Switch";
    TRACE_ENTRY;

    if (parent)
    {
        if (IsBackup())
        {
            if (GetPid() == parent->GetParentPid())
            {
                // The parent now doesn't have a backup
                parent->SetParentNid ( -1 );
                parent->SetParentPid ( -1 );
                parent->SetParent ( NULL );
            }
            else
            {
               if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
                   trace_printf("%s@%d" " - Parent not our primary" "\n", method_name, __LINE__);
            }
        }
        if (parent->IsBackup())
        {
            if (GetPid() == parent->GetParentPid())
            {
                // The parent is now the primary
                parent->SetBackup ( false );
                parent->SetParentNid ( -1 );
                parent->SetParentPid ( -1 );
                parent->SetParent ( NULL );
                if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
                    trace_printf("%s@%d" " - Backup taking over, Name=" "%s" "\n", method_name, __LINE__, parent->GetName());
            }
            else
            {
                if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
                    trace_printf("%s@%d" " - Parent not our backup" "\n", method_name, __LINE__);
            }
        }
    }

    TRACE_EXIT;
}


CProcessContainer::CProcessContainer (void)
                  :numProcs_(0)
                  ,nodeContainer_(false)
                  ,processNameFormatLong_(true)
                  ,nameMap_(NULL)
                  ,pidMap_(NULL)
                  ,head_(NULL)
                  ,tail_(NULL)
{
    const char method_name[] = "CProcessContainer::CProcessContainer";
    TRACE_ENTRY;

    // Add eyecatcher sequence as a debugging aid
    memcpy(&eyecatcher_, "PCTR", 4);

    //create & initialize existing semaphore
    char sem_name[MAX_PROCESS_PATH];
    snprintf(sem_name,sizeof(sem_name), "/monitor.sem.%s", getenv("USER"));
    Mutex = sem_open(sem_name,O_CREAT,0644,0);
    if(Mutex == SEM_FAILED)
    {
        char buf[MON_STRING_BUF_SIZE];
        snprintf(buf, sizeof(buf), "[%s], Can't create semaphore %s!\n",
                 method_name, sem_name);
        mon_log_write(MON_PROCESSCONT_PROCESSCONT_1, SQ_LOG_ERR, buf);

        sem_unlink(sem_name);
        abort();
    }

#ifndef NAMESERVER_PROCESS
    char *env = getenv("SQ_MON_PROCESS_NAME_FORMAT_LONG");
    if ( env && isdigit(*env) )
    {
        int val = atoi(env);
        processNameFormatLong_ = (val != 0) ? true : false;
    }
#endif

    TRACE_EXIT;
}

CProcessContainer::CProcessContainer( bool nodeContainer )
                  :numProcs_(0)
                  ,nodeContainer_(nodeContainer)
                  ,processNameFormatLong_(true)
                  ,nameMap_(NULL)
                  ,pidMap_(NULL)
                  ,head_(NULL)
                  ,tail_(NULL)
{
    const char method_name[] = "CProcessContainer::CProcessContainer";
    TRACE_ENTRY;

    // Add eyecatcher sequence as a debugging aid
    memcpy(&eyecatcher_, "PCTR", 4);

    //create & initialize existing semaphore
    char sem_name[MAX_PROCESS_PATH];
    snprintf(sem_name,sizeof(sem_name), "/monitor.sem.%s", getenv("USER"));
    Mutex = sem_open(sem_name,O_CREAT,0644,0);
    if(Mutex == SEM_FAILED)
    {
        char buf[MON_STRING_BUF_SIZE];
        int err = errno;
        snprintf(buf, sizeof(buf), "[%s], Can't create semaphore %s! (%s)\n",
                 method_name, sem_name, strerror(err));
        mon_log_write(MON_PROCESSCONT_PROCESSCONT_3, SQ_LOG_ERR, buf);

        err = sem_unlink(sem_name);
        if (err == -1)
        {
            int err = errno;
            snprintf(buf, sizeof(buf), "[%s], Can't unlink semaphore %s! (%s)\n",
                     method_name, sem_name, strerror(err));
            mon_log_write(MON_PROCESSCONT_PROCESSCONT_4, SQ_LOG_ERR, buf);
        }
        abort();
    }

#ifndef NAMESERVER_PROCESS
    char *env = getenv("SQ_MON_PROCESS_NAME_FORMAT_LONG");
    if ( env && isdigit(*env) )
    {
        int val = atoi(env);
        processNameFormatLong_ = (val != 0) ? true : false;
    }
#endif

    if ( nodeContainer_ )
    {
        nameMap_ = new nameMap_t;
        pidMap_ = new pidMap_t;
    }

    TRACE_EXIT;
}

CProcessContainer::~CProcessContainer (void)
{
    const char method_name[] = "CProcessContainer::~CProcessContainer";
    TRACE_ENTRY;

    if ( nodeContainer_ )
    {
        CleanUpProcesses();
        if ( nameMap_ )
        {
            delete nameMap_;
        }
        if ( pidMap_ )
        {
            delete pidMap_;
        }
    }

    sem_close(Mutex);
    char sem_name[MAX_PROCESS_PATH];
    snprintf(sem_name,sizeof(sem_name), "/monitor.sem.%s", getenv("USER"));
    sem_unlink(sem_name);

    // Alter eyecatcher sequence as a debugging aid to identify deleted object
    memcpy(&eyecatcher_, "pctr", 4);

    TRACE_EXIT;
}

void CProcessContainer::AddToPidMap(int pid, CProcess *process)
{
    const char method_name[] = "CProcessContainer::AddToPidMap";
    TRACE_ENTRY;

    if ( ! nodeContainer_ )
    {
        // Programmer bonehead :^)
        // This must only be called from CNode (the physical node)
        abort();
    }

    pair<pidMap_t::iterator, bool> ret;

    if (pid != -1)
    {
        // temp trace, remove once USE_PROCESS_MAPS is default
        if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL))
        {
            trace_printf("%s@%d inserting into pidMap %p: %d, %s (%d, %d)\n"
                        , method_name, __LINE__
                        , pidMap_, pid
                        , process->GetName(), process->GetNid(), process->GetPid());
        }

        pidMapLock_.lock();
        ret = pidMap_->insert( pidMap_t::value_type ( pid, process ));
        pidMapLock_.unlock();
        if (ret.second == false)
        {   // Already had an entry with the given key value
            if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL))
            {
                trace_printf("%s@%d pid map already contained process %d\n",
                             method_name, __LINE__, pid);
            }
        }

        // temp trace
        if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL))
        {
            trace_printf("%s@%d pidMap_ (%p) now has %d entries\n",
                         method_name, __LINE__, pidMap_, (int)pidMap_->size());
        }

    }

    TRACE_EXIT;
}

void CProcessContainer::DelFromPidMap( CProcess *process )
{
    const char method_name[] = "CProcessContainer::DelFromPidMap";
    TRACE_ENTRY;

    pidMapLock_.lock();
    int count = pidMap_->erase ( process->GetPid() );
    pidMapLock_.unlock();

    if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL))
    {
        if (count != 0)
        {
            trace_printf("%s@%d removed from pidMap %p: %s (%d, %d)\n",
                         method_name, __LINE__, pidMap_,
                         process->GetName(), process->GetNid(), process->GetPid());
        }
    }

    if ( process->GetPid() != process->GetPidAtFork() )
    {   // Process id changed after fork(). [This could happen if, for
        // example, a shell script was the originally started process
        // and it then started the actual process.
        pidMapLock_.lock();
        int count = pidMap_->erase ( process->GetPidAtFork() );
        pidMapLock_.unlock();

        if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL))
        {
            if (count != 0)
            {
                trace_printf("%s@%d removed from pidMap %p: %s (%d, %d)\n",
                             method_name, __LINE__, pidMap_,
                             process->GetName(), process->GetNid(),
                             process->GetPidAtFork());
            }
        }
    }

    TRACE_EXIT;
}

void CProcessContainer::AddToNameMap( CProcess *process )
{
    const char method_name[] = "CProcessContainer::AddToNameMap";
    TRACE_ENTRY;

    if ( ! nodeContainer_ )
    {
        // Programmer bonehead :^)
        // This must only be called from CNode (the physical node)
        abort();
    }

    pair<nameMap_t::iterator, bool> ret1;

    if ( strlen(process->GetName()) != 0 )
    {

        if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL))
        {
            trace_printf("%s@%d inserting into nameMap %p: %s (%d, %d)\n", method_name, __LINE__, nameMap_, process->GetName(), process->GetNid(), process->GetPid());
        }

        nameMapLock_.lock();
        ret1 = nameMap_->insert( nameMap_t::value_type ( process->GetName(),
                                                        process ));
        nameMapLock_.unlock();
        if (ret1.second == false)
        {   // Already had an entry with the given key value.  This is not
            // necessarily an error.  One sceario where this can happen is
            // if a new process request contains a user assigned process
            // name and the process is to be created on another node.
            // When the InternalType_ProcInit replication message is
            // processed on the originating node we'll attempt to re-add
            // the name (a system generated name will be added for the first
            // time at this point.)
            if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL))
            {
                trace_printf("%s@%d nameMap %p already contained process %s\n",
                             method_name, __LINE__, nameMap_, process->GetName());
            }
        }

        // temp trace
        if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL))
        {
            trace_printf("%s@%d nameMap_ (%p) now has %d entries\n",
                         method_name, __LINE__, nameMap_,
                         (int)nameMap_->size());
        }

    }

    TRACE_EXIT;
}

void CProcessContainer::DelFromNameMap( CProcess *process )
{
    const char method_name[] = "CProcessContainer::DelFromNameMap";
    TRACE_ENTRY;

    if ( ! nodeContainer_ )
    {
        // Programmer bonehead :^)
        // This must only be called from CNode (the physical node)
        abort();
    }

    CProcess *p2 = GetProcess ( process->GetName(), false );
    if ( p2 == NULL)
    {  // Process was not in the map, no need to erase
        if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL))
        {
            trace_printf("%s@%d not removing from nameMap %p: %s (%d, %d)."
                         "  No such mapping\n",
                         method_name, __LINE__, nameMap_,
                         process->GetName(), process->GetNid(), process->GetPid());
        }
    }
    else if (p2 != process)
    {
        // Name was in map but process object is not what we were expecting
        // so leave it alone
        if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL))
        {
            trace_printf("%s@%d not removing from nameMap %p: %s (%d, %d)."
                         "  Map contains %s (%d, %d)\n",
                         method_name, __LINE__, nameMap_,
                         process->GetName(), process->GetNid(), process->GetPid(),
                         p2->GetName(), p2->GetNid(), p2->GetPid());
        }
    }
    else
    {

        nameMapLock_.lock();
        int count = nameMap_->erase ( process->GetName() );
        nameMapLock_.unlock();

        if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL))
        {
            if (count != 0)
            {
                trace_printf("%s@%d removed from nameMap %p: %s (%d, %d)\n",
                             method_name, __LINE__, nameMap_,
                             process->GetName(), process->GetNid(), process->GetPid());
            }
        }
    }

}

void CProcessContainer::AddToList(CProcess *process)
{
    const char method_name[] = "CProcessContainer::AddToList";
    TRACE_ENTRY;

    if ( ! nodeContainer_ )
    {
        // Programmer bonehead :^)
        // This must only be called from CNode (the physical node)
        abort();
    }

    if (process)
    {
        // link it to the CNode container
        if (head_ == NULL)
        {
            head_ = tail_ = process;
            process->prev_ = NULL;
        }
        else
        {
            tail_->next_ = process;
            process->prev_ = tail_;
            tail_ = process;
        }
        process->next_ = NULL;
        numProcs_++;

        // link it to the CLNode container
        CLNode *lnode = Nodes->GetLNode( process->Nid );
        lnode->AddToListL( process );

        if (trace_settings & (TRACE_PROCESS_DETAIL))
        {
            CNode *node = lnode->GetNode();
            trace_printf("%s@%d" " container %p - pnid=%d, process count=%d, pnode=%d" "\n", method_name, __LINE__, this, node->GetPNid(), numProcs_, nodeContainer_);
        }

        AddToNameMap(process);
        if ( process->Pid != -1 )
        {
            AddToPidMap(process->Pid, process);
        }

    }

    TRACE_EXIT;
}

void CProcessContainer::AddToListL(CProcess *process)
{
    const char method_name[] = "CProcessContainer::AddToListL";
    TRACE_ENTRY;

    if ( nodeContainer_ )
    {
        // Programmer bonehead :^)
        // This must only be called from CLNode (the logical node)
        abort();
    }

    if (process)
    {
        // link it to the CLNode container
        if (head_ == NULL)
        {
            head_ = tail_ = process;
            process->prevL_ = NULL;
        }
        else
        {
            tail_->nextL_ = process;
            process->prevL_ = tail_;
            tail_ = process;
        }
        process->nextL_ = NULL;
        numProcs_++;

        if (trace_settings & (TRACE_PROCESS_DETAIL))
        {
            trace_printf("%s@%d" " - container %p nid=%d, process count=%d, pnode=%d" "\n", method_name, __LINE__, this, process->Nid, numProcs_, nodeContainer_);
        }
    }

    TRACE_EXIT;
}

#ifndef NAMESERVER_PROCESS
void CProcessContainer::AttachProcessCheck ( struct message_def *msg )
{
    CProcess *process;
    char la_buf[MON_STRING_BUF_SIZE];

    const char method_name[] = "CProcessContainer::AttachProcessCheck";
    TRACE_ENTRY;

    assert ( msg != NULL);

    if ( msg->u.request.u.startup.startup_size != sizeof(msg->u.request.u.startup) )
    {
        snprintf(la_buf, sizeof(la_buf), "[%s], Startup message from %s has invalid size=%d, expecting size=%d\n",
                 method_name, msg->u.request.u.startup.process_name,
                 msg->u.request.u.startup.startup_size,
                 (int) sizeof(msg->u.request.u.startup));
        mon_log_write(MON_PROCESSCONT_ATTACHPCHECK_1, SQ_LOG_ERR, la_buf);

        abort(); // TODO: revisit
    } else if ((MyNode->GetState() != State_Up &&
                MyNode->GetState() != State_Shutdown) &&
               ( strcmp(msg->u.request.u.startup.program,"shell")!=0 )   )
    {
        // Check if we can accept a connection
        snprintf(la_buf, sizeof(la_buf), "[%s], Can't accept %s because node is logically down\n", method_name, msg->u.request.u.startup.process_name);
        mon_log_write(MON_PROCESSCONT_ATTACHPCHECK_1, SQ_LOG_ERR, la_buf);

        msg->u.reply.type = ReplyType_Generic;
        msg->u.reply.u.generic.nid = -1;
        msg->u.reply.u.generic.pid = -1;
        msg->u.reply.u.generic.verifier = -1;
        msg->u.reply.u.generic.process_name[0] = '\0';
        msg->u.reply.u.generic.return_code = MPI_ERR_OP;
    }

    // shell is trying to attach across all nodes
    else if (msg->u.request.u.startup.paired)
    {
        if (trace_settings & (TRACE_REQUEST | TRACE_SYNC | TRACE_INIT | TRACE_PROCESS))
           trace_printf("%s@%d" " - paired attach" "\n", method_name, __LINE__);
        Nodes->GetLNode (msg->u.request.u.startup.process_name, &process);
        if (process)
        {
            process->SetPaired ( true );
            process->SetClone( false );
            msg->u.reply.type = ReplyType_Generic;
            msg->u.reply.u.generic.nid = process->GetNid();
            msg->u.reply.u.generic.pid = process->GetPid();
            msg->u.reply.u.generic.verifier = process->GetVerifier();
            strcpy (msg->u.reply.u.generic.process_name, process->GetName());
            msg->u.reply.u.generic.return_code = MPI_SUCCESS;
        }
        else
        {
            // Can't find process
            snprintf(la_buf, sizeof(la_buf),
                     "[%s], Can't find or clone Process %s to pair attach!\n",
                     method_name, msg->u.request.u.startup.process_name);
            mon_log_write(MON_PROCESSCONT_ATTACHPCHECK_2, SQ_LOG_ERR, la_buf);

            msg->u.reply.type = ReplyType_Generic;
            msg->u.reply.u.generic.nid = -1;
            msg->u.reply.u.generic.pid = -1;
            msg->u.reply.u.generic.verifier = -1;
            msg->u.reply.u.generic.process_name[0] = '\0';
            msg->u.reply.u.generic.return_code = MPI_ERR_NAME;
        }
    }
    // check if its an attach request, if so setup the process
    else if ((msg->u.request.u.startup.nid == -1) &&
             (msg->u.request.u.startup.pid == -1)   )
    {
        Nodes->GetLNode (msg->u.request.u.startup.process_name, &process);
        if (!process)
        {

            if (trace_settings & (TRACE_REQUEST | TRACE_SYNC | TRACE_PROCESS))
               trace_printf("%s@%d" " - process attaching" "\n", method_name, __LINE__);
            if ( ! nodeContainer_ )
            {
                // Programmer bonehead :^)
                // This must only be called from MyNode (the local physical node)
                abort();
            }
            if ( ! MyNode->IsSpareNode() )
            {
                int nid = MyNode->AssignNid();
                if ( (nid == -1) && (MyNode->GetState() != State_Up) )
                {
                    snprintf( la_buf, sizeof(la_buf),
                            "[%s], Can't attach the pid %d (program: %s) - the monitor is not up yet (curr state: %d).\n",
                            method_name,
                            msg->u.request.u.startup.os_pid,
                            msg->u.request.u.startup.program,
                            MyNode->GetState() );
                    mon_log_write( MON_PROCESSCONT_ATTACHPCHECK_4, SQ_LOG_ERR, la_buf );

                    msg->u.reply.type = ReplyType_Generic;
                    msg->u.reply.u.generic.nid = -1;
                    msg->u.reply.u.generic.pid = -1;
                    msg->u.reply.u.generic.verifier = -1;
                    msg->u.reply.u.generic.process_name[0] = '\0';
                    msg->u.reply.u.generic.return_code = MPI_ERR_NAME;
                }
                else
                {
                    strId_t progStrId = MyNode->GetStringId( msg->u.request.u.startup.program );
                    strId_t nullStrId = { -1, -1 };
                    process =
                        new CProcess( NULL, nid, msg->u.request.u.startup.os_pid, ProcessType_Generic, 0, 0, false, true, (char *) "",
                        nullStrId, nullStrId, progStrId, (char *) "", (char *) "" );
                    if ( process == NULL )
                    {
                        //TODO: Log event
                        abort();
                    }
                    if ( process )
                    {
                        char user_argv[MAX_ARGS][MAX_ARG_SIZE];
                        process->userArgs( 0, user_argv );
                    }
                    if ( msg->u.request.u.startup.process_name[0] == '\0' )
                    {   // Create a name for the process and place it in the
                        // Name member of the process object);
                        char pname[MAX_KEY_NAME];
                        MyNode->BuildOurName( nid, process->GetPid( ), pname );
                        process->SetName( pname );
                    }
                    else
                    {
                        process->SetName(
                            MyNode->NormalizeName( msg->u.request.u.startup.process_name ) );
                    }
                    process->SetAttached( true );
                    process->SetupFifo( process->GetNid( ), msg->u.request.u.startup.os_pid );
                    process->SetCreationTime( msg->u.request.u.startup.os_pid );
                    process->SetVerifier( ); // CProcessContainer::AttachProcessCheck
                    AddToList( process );
                    process->CompleteProcessStartup( msg->u.request.u.startup.port_name, // CProcessContainer::AttachProcessCheck
                                                     msg->u.request.u.startup.os_pid,
                                                     msg->u.request.u.startup.event_messages,
                                                     msg->u.request.u.startup.system_messages,
                                                     false,
                                                     NULL,
                                                     MyPNID );

                    msg->u.reply.type = ReplyType_Startup;
                    msg->u.reply.u.startup_info.nid = process->GetNid( );
                    msg->u.reply.u.startup_info.pid = process->GetPid( );
                    msg->u.reply.u.startup_info.verifier = process->GetVerifier( );
                    strcpy( msg->u.reply.u.startup_info.process_name, process->GetName( ) );
                    msg->u.reply.u.startup_info.return_code = MPI_SUCCESS;
                    STRCPY( msg->u.reply.u.startup_info.fifo_stdin,
                            process->fifo_stdin() );
                    STRCPY( msg->u.reply.u.startup_info.fifo_stdout,
                            process->fifo_stdout() );
                    STRCPY( msg->u.reply.u.startup_info.fifo_stderr,
                            process->fifo_stderr() );

                    Monitor->writeProcessMapBegin( process->GetName( )
                                                 , process->GetNid( )
                                                 , process->GetPid( )
                                                 , process->GetVerifier( )
                                                 , -1, -1, -1
                                                 , msg->u.request.u.startup.program );
                }
            }
            else
            {
                snprintf( la_buf, sizeof(la_buf),
                        "[%s], Can't attach, node is a spare node!\n",
                        method_name );
                mon_log_write( MON_PROCESSCONT_ATTACHPCHECK_3, SQ_LOG_ERR, la_buf );

                msg->u.reply.type = ReplyType_Startup;
                msg->u.reply.u.startup_info.nid = -1;
                msg->u.reply.u.startup_info.pid = -1;
                msg->u.reply.u.startup_info.verifier = -1;
                msg->u.reply.u.startup_info.process_name[0] = '\0';
                msg->u.reply.u.startup_info.return_code = MPI_ERR_NO_MEM;
            }
        }
        else
        {
            // Find the duplicate process
            snprintf( la_buf, sizeof(la_buf),
                     "[%s], Can't attach duplicate process %s!\n",
                     method_name, msg->u.request.u.startup.process_name );
            mon_log_write( MON_PROCESSCONT_ATTACHPCHECK_4, SQ_LOG_ERR, la_buf );

            msg->u.reply.type = ReplyType_Generic;
            msg->u.reply.u.generic.nid = -1;
            msg->u.reply.u.generic.pid = -1;
            msg->u.reply.u.generic.verifier = -1;
            msg->u.reply.u.generic.process_name[0] = '\0';
            msg->u.reply.u.generic.return_code = MPI_ERR_NAME;
        }
    }
    // complete a monitor child process startup
    else
    {
        if (trace_settings & (TRACE_REQUEST | TRACE_SYNC | TRACE_PROCESS))
           trace_printf("%s@%d" " - child attach" "\n", method_name, __LINE__);
        Monitor->CompleteProcessStartup(msg);
    }

    TRACE_EXIT;
}
#endif

#ifndef NAMESERVER_PROCESS
void CProcessContainer::Bcast (struct message_def *msg)
{
    CProcess *process = NULL;
    SharedMsgDef *shm = NULL;
    SQ_LocalIOToClient::bcastPids_t *bcastPids = NULL;
    unsigned int msgSize;

    const char method_name[] = "CProcessContainer::Bcast";
    TRACE_ENTRY;

    // Prepare a broadcast notice for sending by the local io "pending
    // notice thread".   Do this by formatting an image of the message
    // to be sent along with a the list of process ids that will receive
    // the notice.
    pidMapLock_.lock();
    pidMap_t::iterator pidMapIt;
    for ( pidMapIt = pidMap_->begin(); pidMapIt != pidMap_->end() ; pidMapIt++ )
    {
        process = pidMapIt->second;
        assert( process );
        if (process->IsSystemMessages() &&
            process->GetState() == State_Up)
        {
            if (trace_settings & (TRACE_REQUEST_DETAIL | TRACE_RECOVERY | TRACE_SYNC_DETAIL | TRACE_TMSYNC | TRACE_PROCESS_DETAIL))
                trace_printf( "%s@%d - Send notice to %s (%d, %d:%d)\n"
                            , method_name, __LINE__
                            , process->GetName()
                            , process->GetNid()
                            , process->GetPid()
                            , process->GetVerifier() );

            if (!shm)
            {   // First process, allocate a buffer for the notice image
                // and initialize it.
                shm = new SharedMsgDef;
                memset( &shm->trailer, 0, sizeof(shm->trailer) );
                bcastPids = new SQ_LocalIOToClient::bcastPids_t;
                assert(bcastPids);

                msgSize = SQ_theLocalIOToClient->getSizeOfMsg( msg );

                if ( msgSize > sizeof ( message_def ) )
                {   // Not expected to occur but guard against client
                    // buffer overrun
                    msgSize = sizeof ( message_def );
                }

                memcpy( &shm->msg, msg, msgSize );
                shm->trailer.OSPid = BCAST_PID;
                shm->trailer.verifier = -1;
            }
            // Add this process id to the list.
            SQ_LocalIOToClient::pidVerifier_t pv;
            pv.pv.pid = process->GetPid();
            pv.pv.verifier = process->GetVerifier();
            bcastPids->insert( pv.pnv );
        }
    }
    pidMapLock_.unlock();

    if (shm)
    {
        SQ_theLocalIOToClient->putOnNoticeQueue( BCAST_PID
                                               , -1
                                               , &shm->msg
                                               , bcastPids);
    }

    TRACE_EXIT;
}
#endif

char *CProcessContainer::BuildOurName( int nid, int pid, char *name )
{
    const char method_name[] = "CProcessContainer::BuildOurName";
    TRACE_ENTRY;

    int i;
    int rem;
    int cnt[6];

    if (!processNameFormatLong_)
    {
        // Convert Pid into base 35 acsii
        cnt[0] = pid / 42875;    // (35 * 35 * 35)
        rem = pid - ( cnt[0] * 42875 );
        cnt[1] = rem / 1225;     // (35 * 35)
        rem -= ( cnt[1] * 1225 );
        cnt[2] = rem / 35;
        rem -= ( cnt[2] * 35 );
        cnt[3] = rem;
    
        // Process name format long: '$Zxxpppp' xx = nid, pppp = pid

        // Convert Nid into base 16 acsii
        sprintf(name,"$Z%2.2X",nid);

        // Convert Pid into base 36 ascii
        for(i=3; i>=0; i--)
        {
            if( cnt[i] < 10 )
            {
                name[i+4] = '0'+cnt[i];
            }
            else
            {
                cnt[i] -= 10;
                // we are skipping cap 'o' because it looks like zero.
                if( cnt[i] >= 14 )
                {
                    name[i+4] = 'P'+(cnt[i]-14);
                }
                else
                {
                    name[i+4] = 'A'+cnt[i];
                }
            }
        }
        name[8] = '\0';
    }
    else
    {
        // We are skipping 'A', 'I', 'O', and 'U' to distinguish between zero
        // and one digits, and for political correctness in generated names
        char b32table[32] =  {'0','1','2','3','4','5','6','7','8','9'
                             ,'B','C','D','E','F','G','H','J','K','L','M'
                             ,'N','P','Q','R','S','T','V','W','X','Y','Z' };
    
        // Convert Pid into base 32 ascii
        cnt[0] = pid / 33554432;    // (32 * 32 * 32 * 32 * 32)
        rem = pid - ( cnt[0] * 33554432 );
        cnt[1] = rem / 1048576;     // (32 * 32 * 32 * 32)
        rem -= ( cnt[1] * 1048576 );
        cnt[2] = rem / 32768;       // (32 * 32 * 32)
        rem -= ( cnt[2] * 32768 );
        cnt[3] = rem / 1024;        // (32 * 32)
        rem -= ( cnt[3] * 1024 );
        cnt[4] = rem / 32;
        rem -= ( cnt[4] * 32 );
        cnt[5] = rem;
    
        // Process name format long: '$Zxxxxpppppp' xxxx = nid, pppppp = pid
    
        // Convert Nid into base 16 ascii
        sprintf(name,"$Z%4.4X",nid);
    
        // Convert Pid into base 32 ascii
        for(i=5; i>=0; i--)
        {
            name[i+6] = static_cast<char>(b32table[cnt[i]]);
        }
        name[12] = '\0';
    }

    TRACE_EXIT;
    return name;
}

#ifndef NAMESERVER_PROCESS
bool CProcessContainer::CancelDeathNotification( int nid
                                               , int pid
                                               , int verifier
                                               , _TM_Txid_External trans_id)
{
    bool status = FAILURE;
    CProcess *process = head_;

    // we will loop through all processes on the node ... return FAILURE
    // only if we don't find any notices to cancel.
    while (process)
    {
        status = process->CancelDeathNotification (nid, pid, verifier, trans_id);
        process = process->GetNext ();
    }

    return status;
}
#endif

#ifndef NAMESERVER_PROCESS
// Child_Exit terminates all child processes created by the parent process
// unless the child process is Unhooked from the parent process
void CProcessContainer::Child_Exit ( CProcess * parent )
{
    CProcess *process;

    const char method_name[] = "CProcessContainer::Child_Exit";
    TRACE_ENTRY;
    if (trace_settings & TRACE_ENTRY_EXIT)
        trace_printf("%s@%d with parent (%d, %d)\n", method_name, __LINE__, parent->GetNid(), parent->GetPid() );

    if ( parent &&
           ((MyNode->GetState() != State_Shutdown &&
             MyNode->GetShutdownLevel() == ShutdownLevel_Undefined)
           || (parent->GetType() == ProcessType_SPX) ) )
    {
        CProcess::nidPid_t child;
        CLNode * childLNode;

        while ( parent->childRemoveFirst ( child ))
        {

            childLNode = Nodes->GetLNode( child.nid );
            process = (childLNode != NULL )
                         ? childLNode->GetNode()->GetProcess( child.pid ) : NULL;

            if ( process && (!process->IsUnhooked()) )
            {

                if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
                    trace_printf("%s@%d - Child process %s (%d, %d) exits due "
                                 "to parent death (%d, %d)\n",
                                 method_name, __LINE__, process->GetName(),
                                 process->GetNid(), process->GetPid(),
                                 parent->GetNid(), parent->GetPid());

                childLNode->SetProcessState( process, State_Down, true );
                if ( !process->IsClone() )
                {
                    if ( parent->GetType() == ProcessType_SPX )
                    {
                        kill (process->GetPid(), SIGKILL);
                    }
                    else
                    {
                        kill (process->GetPid(), Monitor->GetProcTermSig());
                    }
                }
                else
                {
                    if (NameServerEnabled)
                    {
                        CNode* childNode = childLNode->GetNode();
                        // Forward the process kill to the target node
                        int rc = PtpClient->ProcessKill( process
                                                       , process->GetAbort()
                                                       , childLNode->GetNid()
                                                       , childNode->GetName() );
                        if (rc)
                        {
                            char la_buf[MON_STRING_BUF_SIZE];
                            snprintf( la_buf, sizeof(la_buf)
                                    , "[%s] - Can't send process kill "
                                      "request for child process %s (%d, %d) "
                                      "to child node %s, nid=%d\n"
                                    , method_name
                                    , process->GetName()
                                    , process->GetNid()
                                    , process->GetPid()
                                    , childNode->GetName()
                                    , childLNode->GetNid() );
                            mon_log_write(MON_PROCESSCONT_CHILDEXIT_1, SQ_LOG_ERR, la_buf);
                        }
                    }
                }
                
                if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
                    trace_printf( "%s@%d - Completed kill for child process %s (%d, %d)\n"
                                , method_name, __LINE__
                                , process->GetName()
                                , process->GetNid()
                                , process->GetPid());
            }
            else
            {
                if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
                {
                    if (process)
                    {
                        trace_printf("%s@%d - Child process %s (%d, %d), not "
                                    "killed, state=%d, unhooked=%d\n",
                                    method_name, __LINE__, process->GetName(),
                                    process->GetNid(), process->GetPid(),
                                    process->GetState(), process->IsUnhooked());

                    }
                }

            }
        }
    }
    TRACE_EXIT;
}

void CProcessContainer::ChildUnHooked_Exit( CProcess* parent )
{
    const char method_name[] = "CProcessContainer::ChildUnHooked_Exit";
    TRACE_ENTRY;

    CProcess *process;

    if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS))
        trace_printf( "%s@%d with parent %s (%d,%d:%d)\n"
                    , method_name, __LINE__
                    , parent->GetName()
                    , parent->GetNid()
                    , parent->GetPid()
                    , parent->GetVerifier() );

    if (NameServerEnabled)
    {
        if ( parent && !parent->IsClone()
           && ((MyNode->GetState() != State_Shutdown
             && MyNode->GetShutdownLevel() == ShutdownLevel_Undefined)) )
        {
            CProcess::nidPid_t child;
            CLNode* childLNode;

            while ( parent->childUnHookedRemoveFirst( child ))
            {
                childLNode = Nodes->GetLNode( child.nid );
                process = (childLNode != NULL )
                             ? childLNode->GetNode()->GetProcess( child.pid ) : NULL;
                if (process)
                {
                    if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
                    {
                        trace_printf( "%s@%d - Telling unhooked child process %s (%d,%d:%d) "
                                      "of parent death %s (%d,%d:%d)\n"
                                    , method_name, __LINE__
                                    , process->GetName()
                                    , process->GetNid()
                                    , process->GetPid()
                                    , process->GetVerifier()
                                    , parent->GetName()
                                    , parent->GetNid()
                                    , parent->GetPid()
                                    , parent->GetVerifier() );
                    }
    
                    CNode* childNode = childLNode->GetNode();
                    // Forward the parent's process exit to the child's node
                    int rc = PtpClient->ProcessExit( parent
                                                   , childLNode->GetNid()
                                                   , childNode->GetName() );
                    if (rc)
                    {
                        char la_buf[MON_STRING_BUF_SIZE];
                        snprintf( la_buf, sizeof(la_buf)
                                , "[%s] - Can't send process exit "
                                  "request for parent process %s (%d,%d:%d) "
                                  "to child's node %s, nid=%d\n"
                                , method_name
                                , parent->GetName()
                                , parent->GetNid()
                                , parent->GetPid()
                                , parent->GetVerifier()
                                , childNode->GetName()
                                , childLNode->GetNid() );
                        mon_log_write(MON_PROCESSCONT_CHILDEXIT_1, SQ_LOG_ERR, la_buf);
                    }
                    else
                    {
                        if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
                            trace_printf( "%s@%d - Completed kill for parent process %s (%d,%d:%d)\n"
                                        , method_name, __LINE__
                                        , parent->GetName()
                                        , parent->GetNid()
                                        , parent->GetPid()
                                        , parent->GetVerifier() );
                    }
                }
            }
        }
    }
    TRACE_EXIT;
}
#endif

void CProcessContainer::CleanUpProcesses( void )
{
    CProcess *process = head_;

    const char method_name[] = "CProcessContainer::CleanUpProcesses";
    TRACE_ENTRY;

    while (process)
    {
        DelFromNameMap ( process );
        DelFromPidMap ( process );

        DeleteFromList(process);
        process = head_;
    }
    numProcs_ = 0;

    if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL))
       trace_printf("%s@%d" " - process count=%d" "\n", method_name, __LINE__, numProcs_);

    TRACE_EXIT;
}

CProcess *CProcessContainer::CloneProcess (int nid,
                                           PROCESSTYPE type,
                                           int priority,
                                           int backup,
                                           bool unhooked,
                                           char *process_name,
                                           char *port,
                                           int os_pid,
                                           int verifier,
                                           int parent_nid,
                                           int parent_pid,
                                           int parent_verifier,
                                           bool event_messages,
                                           bool system_messages,
#ifdef NAMESERVER_PROCESS
                                           char *path, 
                                           char *ldpath, 
                                           char *program, 
#else
                                           strId_t pathStrId,
                                           strId_t ldpathStrId,
                                           strId_t programStrId,
#endif
                                           char *infile,
                                           char *outfile,
                                           struct timespec *creation_time,
                                           int origPNidNs)
{
    char pname[MAX_PROCESS_NAME];
    CProcess *process;
    CProcess *parent = NULL;
    char la_buf[MON_STRING_BUF_SIZE];

    const char method_name[] = "CProcessContainer::CloneProcess";
    TRACE_ENTRY;

    // load & normalize process name
    if( process_name[0] == '\0' )
    {
        pname[0] = '\0';
    }
    else
    {
        STRCPY (pname, NormalizeName (process_name));
    }

    if (parent_nid != -1)
    {
        parent = Nodes->GetLNode (parent_nid)->GetProcessL(parent_pid);
    }

    if (backup)
    {
        if (!parent)
        {
            snprintf(la_buf, sizeof(la_buf),
                     "[%s], Failed, Backup does not have parent's name.\n",
                     method_name);
            mon_log_write(MON_PROCESSCONT_CLONEPROCESS_1, SQ_LOG_ERR, la_buf);
            return NULL;
        }
        if (parent_nid == nid)
        {
            snprintf(la_buf, sizeof(la_buf),
                     "[%s], Failed, Backup can't be in parent's node.\n",
                     method_name);
            mon_log_write(MON_PROCESSCONT_CLONEPROCESS_2, SQ_LOG_ERR, la_buf);
            return NULL;
        }
    }
    else
    {
        if (pname[0] != '\0')
        {
            Nodes->GetLNode (pname, &process);
            if (process)
            {
                snprintf(la_buf, sizeof(la_buf),
                         "[%s], Failed, Duplicate processname (%s).\n",
                         method_name, process_name);
                mon_log_write(MON_PROCESSCONT_CLONEPROCESS_3, SQ_LOG_ERR, la_buf);
                return NULL;
            }
        }
    }
    if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_PROCESS_DETAIL | TRACE_REQUEST_DETAIL))
        trace_printf("%s@%d - Process name=%s (%d, %d), port=%s, "
                     "parent (%d, %d)\n", method_name,
                     __LINE__, pname, nid, os_pid, port, parent_nid, parent_pid);

#ifdef NAMESERVER_PROCESS
    process =
        new CProcess( parent
                    , nid
                    , os_pid
                    , verifier
                    , type
                    , priority
                    , backup
                    , false
                    , unhooked
                    , pname
                    , path
                    , ldpath
                    , program
                    , infile
                    , outfile);
#else
    process =
        new CProcess (parent, nid, os_pid, type, priority, backup, false, unhooked, pname, pathStrId, ldpathStrId,
                      programStrId, infile, outfile);
#endif

    if (process)
    {
        process->SetVerifier(verifier); // CProcessContainer::CloneProcess
        process->SetParentVerifier(parent_verifier);

        AddToList( process );

        process->CompleteProcessStartup (port, os_pid, event_messages, system_messages, os_pid==-1, creation_time, origPNidNs); // CProcessContainer::CloneProcess
    }

    TRACE_EXIT;
    return process;
}


CProcess *CProcessContainer::CompleteProcessStartup (char *process_name,
                                                     char *port,
                                                     int os_pid,
                                                     bool event_messages,
                                                     bool system_messages,
                                                     struct timespec *creation_time,
                                                     int origPNidNs)
{
    CProcess *process = NULL;

    const char method_name[] = "CProcessContainer::CompleteProcessStartup";
    TRACE_ENTRY;

    if ( nodeContainer_ )
    {
        process = GetProcess(process_name,false);
    }
    else
    {
        // Not supposed to be able to get here.
        abort();

    }
    if (process)
    {
        if (process->GetPid() != os_pid)
        { // Process id changed from when we started the process.
#ifndef NAMESERVER_PROCESS
            if ( !process->IsUnhooked() )
            {   // Parent process object keeps track of child processes
                // created on this node.  Needed in case parent process
                // exits abnormally.
                int parentNid;
                int parentPid;
                if ( ! process->IsBackup() )
                {
                    parentNid = process->GetParentNid();
                    parentPid = process->GetParentPid();
                }
                else
                {
                    parentNid = process->GetPairParentNid();
                    parentPid = process->GetPairParentPid();
                }

                if ( parentNid != -1 && parentPid != -1 )
                {
                    CProcess* parent;
                    parent = Nodes->GetLNode ( parentNid )
                                ->GetProcessL( parentPid );
                    if ( parent && !process->IsBackup() )
                    {
                        parent->childRemove ( process->GetNid(),
                                              process->GetPid() );
                        parent->childAdd ( process->GetNid(), os_pid );
                    }
                }
            }
            if (NameServerEnabled)
            {
                if (process->IsUnhooked())
                {   // Parent process object keeps track of child processes
                    // created. Needed when parent process exits to clean up
                    // parent clone process object in remote nodes.
                    int parentNid;
                    int parentPid;
                    CProcess* parent;
                    if ( !process->IsBackup() )
                    {
                        parentNid = process->GetParentNid();
                        parentPid = process->GetParentPid();
                    }
                    else
                    {
                        parentNid = process->GetPairParentNid();
                        parentPid = process->GetPairParentPid();
                    }
    
                    if ( parentNid != -1 && parentPid != -1 )
                    {
                        parent = Nodes->GetLNode(parentNid)->GetProcessL(parentPid);
                        if ( parent && !parent->IsClone() && !process->IsBackup() )
                        {
                            parent->childUnHookedRemove( process->GetNid()
                                                       , process->GetPid() );
                            parent->childUnHookedAdd( process->GetNid()
                                                    , os_pid );
                        }
                    }
                }
            }
#endif
            // Process id changed from when we started the process.  So
            // remap using the new pid.  [This could happen if, for example,
            // a shell script was the originally started process and it
            // then started the process that is now sending its startup message]
            if (trace_settings & TRACE_PROCESS)
            {
                trace_printf("%s@%d - process id changed, new pid at process"
                             " startup=%d, original pid=%d\n",
                             method_name, __LINE__, os_pid,
                             process->GetPid() );
            }
            AddToPidMap ( os_pid, process );
        }
        process->CompleteProcessStartup (port, os_pid, event_messages, system_messages, false, creation_time, origPNidNs); // CProcessContainer::CompleteProcessStartup
    }
    // When using process maps do not log an error if the process is
    // not found.  This method can be called from
    // CCluster::HandleOtherNodeMsg to check if process exists.
    TRACE_EXIT;
    return process;
}

#ifndef NAMESERVER_PROCESS
CProcess *CProcessContainer::CreateProcess (CProcess * parent,
                                            int nid,
                                            PROCESSTYPE type,
                                            int debug,
                                            int priority,
                                            int backup,
                                            bool unhooked,
                                            char *process_name,
                                            strId_t pathStrId,
                                            strId_t ldpathStrId,
                                            strId_t programStrId,
                                            char *infile,
                                            char *outfile,
                                            void *tag,
                                            int &result)
{
    CProcess *process = NULL;
    char la_buf[MON_STRING_BUF_SIZE];

    const char method_name[] = "CProcessContainer::CreateProcess";
    TRACE_ENTRY;

    result = MPI_SUCCESS;

    // load & normalize process name
    if( process_name[0] != '\0' )
    {
        NormalizeName (process_name);
    }

    if (backup)
    {
        if ( !parent || (strcmp (parent->GetName(), process_name) != 0) )
        {
            snprintf(la_buf, sizeof(la_buf),
                     "[%s], Failed, Backup does not have parent's name.\n",
                     method_name);
            mon_log_write(MON_PROCESSCONT_CREATEPROCESS_1, SQ_LOG_ERR, la_buf);

            result = MPI_ERR_NAME;

            return NULL;
        }
        if (parent->GetNid() == nid)
        {
            snprintf(la_buf, sizeof(la_buf),
                     "[%s], Failed, Backup can't be in parent's node.\n",
                     method_name);
            mon_log_write(MON_PROCESSCONT_CREATEPROCESS_2, SQ_LOG_ERR, la_buf);

            result = MPI_ERR_RANK;

            return NULL;
        }
    }
    else
    {
        Nodes->GetLNode (process_name, &process, false);
        if (process)
        {
            snprintf(la_buf, sizeof(la_buf),
                     "[%s], Failed, Duplicate processname (%s).\n",
                     method_name, process_name);
            mon_log_write(MON_PROCESSCONT_CREATEPROCESS_3, SQ_LOG_ERR, la_buf);

            result = MPI_ERR_NAME;
            return NULL;
        }
    }

    process =
        new CProcess (parent, nid, -1, type, priority, backup, debug, unhooked, process_name,
                      pathStrId, ldpathStrId, programStrId, infile, outfile);
    if (process)
    {
        AddToList( process );
        if (type == ProcessType_NameServer ||
            type == ProcessType_Watchdog ||
            type == ProcessType_PSD ||
            type == ProcessType_SMS )
        {
            if (type == ProcessType_NameServer)
            {
                process->userArgs ( monitorArgc, monitorArgv );
            }
            if (process->Create (parent, tag, result)) // monitor
            {
                AddToPidMap(process->GetPid(), process);
            }
        }
        else if ( type == ProcessType_SSMP )
        {
            Nodes->GetLNode ( nid )->SetSSMProc ( process );
        }
    }
    TRACE_EXIT;

    return process;
}
#endif

#ifdef NAMESERVER_PROCESS
void CProcessContainer::DeleteAllDown()
{
    CProcess *process  = NULL;
    int nid = -1;
    int pid = -1;

    const char method_name[] = "CProcessContainer::DeleteAllDown";
    TRACE_ENTRY;

    nameMap_t::iterator nameMapIt;

    while ( true )
    {
        nameMapLock_.lock();
        nameMapIt = nameMap_->begin();

        if (nameMap_->size() == 0)
        {
            nameMapLock_.unlock();
            break; // all done
        }

        process = nameMapIt->second;

        // Delete name map entry
        nameMap_->erase (nameMapIt);

        nameMapLock_.unlock();

        nid = process->GetNid();
        pid = process->GetPid();

        if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL))
        {
            trace_printf("%s@%d removed from nameMap %p: %s (%d, %d)\n",
                         method_name, __LINE__, nameMap_,
                         process->GetName(), nid, pid);
        }

        // Delete pid map entry
        DelFromPidMap ( process );

        if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
        {
            trace_printf( "%s@%d - Completed delete for %s (%d, %d)\n"
                        , method_name, __LINE__
                        , process->GetName(), nid, pid);
        }

        // Remove all processes
        // PSD will re-create persistent processes on spare node activation
        Exit_Process( process, true, nid );
    }

    TRACE_EXIT;
}
#endif

void CProcessContainer::DeleteFromList( CProcess *process )
{
    const char method_name[] = "CProcessContainer::DeleteFromList";
    TRACE_ENTRY;

    if ( ! nodeContainer_ )
    {
        // Programmer bonehead :^)
        // This must only be called from CNode (the physical node)
        abort();
    }

    if (process)
    {
        RemoveFromList( process );

        if (process->replRefCount() == 0)
        {   // Process object is not in replication queue so ok to
            // delete.
            if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL))
            {
                trace_printf("%s@%d - Deleting process %s (%d, %d)\n", method_name, __LINE__, process->Name, process->Nid, process->Pid );
            }
            delete process;
        }
        else
        {   // Process object is in replication queue.  Replication
            // queueing logic will delete the object once the replication
            // has completed.   Set the state here to indicate that
            // the object is no longer on the process list.
            process->SetState (State_Unlinked);
            if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL))
            {
                trace_printf("%s@%d - Setting process %s (%d, %d) state to State_Unlinked\n", method_name, __LINE__, process->Name, process->Nid, process->Pid );
            }
        }
    }

    TRACE_EXIT;
}

void CProcessContainer::RemoveFromList( CProcess *process )
{
    const char method_name[] = "CProcessContainer::RemoveFromList";
    TRACE_ENTRY;

    if ( ! nodeContainer_ )
    {
        // Programmer bonehead :^)
        // This must only be called from CNode (the physical node)
        abort();
    }

    if (process)
    {
        CLNode *lnode = Nodes->GetLNode( process->Nid );
        lnode->RemoveFromListL( process );

        if (head_ == process)
            head_ = process->next_;
        if (tail_ == process)
            tail_ = process->prev_;
        if (process->prev_)
            process->prev_->next_ = process->next_;
        if (process->next_)
            process->next_->prev_ = process->prev_;
        numProcs_--;
        if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL))
        {
            CNode *node = lnode->GetNode();
            trace_printf("%s@%d" " - container %p pnid=%d, process count=%d, pnode=%d" "\n", method_name, __LINE__, this, node->GetPNid(), numProcs_, nodeContainer_);
        }

    }

    TRACE_EXIT;
}

void CProcessContainer::RemoveFromListL( CProcess *process )
{
    const char method_name[] = "CProcessContainer::RemoveFromListL";
    TRACE_ENTRY;

    if ( nodeContainer_ )
    {
        // Programmer bonehead :^)
        // This must only be called from CLNode (the logical node)
        abort();
    }

    if (process)
    {

        if (head_ == process)
            head_ = process->nextL_;
        if (tail_ == process)
            tail_ = process->prevL_;
        if (process->prevL_)
            process->prevL_->nextL_ = process->nextL_;
        if (process->nextL_)
            process->nextL_->prevL_ = process->prevL_;
        numProcs_--;
        if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL))
        {
            trace_printf("%s@%d" " - container %p nid=%d, process count=%d, pnode=%d" "\n", method_name, __LINE__, this, process->Nid, numProcs_, nodeContainer_);
        }
    }

    TRACE_EXIT;
}

#ifndef NAMESERVER_PROCESS
bool CProcessContainer::Dump_Process (CProcess *dumper, CProcess *process, char *core_path)
{
    bool status;

    const char method_name[] = "CProcessContainer::Dump_Process";
    TRACE_ENTRY;

    status = process->Dump(dumper, core_path);

    TRACE_EXIT;
    return status;
}
#endif

#ifndef NAMESERVER_PROCESS
void CProcessContainer::DumpCallback( int nid, pid_t pid, int status )
{
    const char method_name[] = "CProcessContainer::DumpCallback";
    TRACE_ENTRY;

    if ( nodeContainer_ )
    {
        // Programmer bonehead :^)
        // This must only be called from CLNode (the logical node)
        abort();
    }

    CLNode   *lnode = Nodes->GetLNode( nid );
    CNode    *node = lnode->GetNode();

    CProcess *process = node->GetProcess( pid );
    if ( process )
    {
        if (WIFEXITED(status) && (WEXITSTATUS(status) == 0))
        {
            if (trace_settings & TRACE_PROCESS)
            {
                trace_printf("%s@%d - dump successful, nid=%d, pid=%d\n",
                             method_name, __LINE__, nid, pid );
            }
            process->SetDumpStatus( Dump_Success );
        }
        else
        {
            if (trace_settings & TRACE_PROCESS)
            {
                trace_printf("%s@%d - dump failed, nid=%d, pid=%d\n",
                             method_name, __LINE__, nid, pid );
            }
            process->SetDumpStatus( Dump_Failed );
        }
        process->SetDumpState( Dump_Complete );

        CReplDumpComplete *repl = new CReplDumpComplete( process );
        Replicator.addItem(repl);
    }
    else
    {
        if (trace_settings & TRACE_PROCESS)
        {
            trace_printf("%s@%d - dump process not found, nid=%d, pid=%d\n",
                         method_name, __LINE__, nid, pid );
        }
    }

    TRACE_EXIT;
}
#endif


#ifndef NAMESERVER_PROCESS
CProcess * CProcessContainer::ParentNewProcReply ( CProcess *process, int result )
{
    const char method_name[] = "CProcessContainer::ParentNewProcReply";
    TRACE_ENTRY;

    CProcess *parent = NULL;

    if (process->GetParentNid() != -1)
    {
        parent = Nodes->GetProcess( process->GetParentNid(),
                                    process->GetParentPid() );
    }

    // If we have a parent process then it is expecting a reply
    if (parent && !parent->IsClone() && !parent->IsPaired())
    {
        if (!process->IsNowait())
        {   // The new process request was "waited" so send reply now
            struct message_def *reply_msg;
            reply_msg = process->parentContext();

            if ( reply_msg )
            {
                // send reply to the parent
                parent->ReplyNewProcess ( reply_msg, process, result );
                // Since we have replied parent context (i.e the request
                // buffer) is no longer valid.
                process->parentContext( NULL );
            }
        }
        else
        {   // The new process request was "no-wait" so send notice now
            process->SendProcessCreatedNotice(parent, result);
        }
    }

    TRACE_EXIT;

    return parent;
}
#endif

#ifndef NAMESERVER_PROCESS
void CProcessContainer::Exit_Process (CProcess *process, bool abend, int downNode)
{
    bool restarted = false;
    char la_buf[MON_STRING_BUF_SIZE];
    CProcess *parent = NULL;

    const char method_name[] = "CProcessContainer::Exit_Process(process)";
    TRACE_ENTRY;

    if (process)
    {
        if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
            trace_printf( "%s@%d - Process %s (abended=%d) is exiting, abend=%d, downNode=%d\n"
                        , method_name, __LINE__
                        , process->GetName()
                        , process->IsAbended()
                        , abend
                        , downNode );

        if ( process->GetState() == State_Down && abend && !process->IsAbended() )
        {
            process->SetAbended( abend );
        }
        if (process->GetNid() == downNode && !process->IsAbended() )
        {
            process->SetAbended( abend );
        }

        if ( numProcs_ <= 0 )
        {
            snprintf(la_buf, sizeof(la_buf),
                     "[%s], Node's process count is invalid, aborting\n",
                     method_name);
            mon_log_write(MON_PROCESSCONT_EXITPROCESS_1, SQ_LOG_ERR, la_buf);
            abort();
        }

        if ( process->GetState() == State_Stopped )
        {
            if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
                trace_printf("%s@%d" " - Process " "%s" " already exited." "\n", method_name, __LINE__, process->GetName());
            return;
        }

        if (!process->IsStartupCompleted())
        {
            parent = ParentNewProcReply ( process, MPI_ERR_SPAWN );

            char buf[MON_STRING_BUF_SIZE];
            snprintf(buf, sizeof(buf),
                     "[%s], Exiting process %s (%d, %d) did not complete "
                     "startup\n",
                     method_name, process->GetName(), process->GetNid(),
                     process->GetPid());
            mon_log_write(MON_PROCESSCONT_EXITPROCESS_2, SQ_LOG_ERR, buf);
        }

        if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
            trace_printf( "%s@%d - Process %s is exiting, persistent=%d, abended=%d\n"
                        , method_name, __LINE__
                        , process->GetName()
                        , process->IsPersistent()
                        , process->IsAbended() );

        if ( process->IsPersistent() &&
            (process->IsAbended() || process->GetType() == ProcessType_SPX))
        {
            Child_Exit(process);
        }

        if (!process->IsClone() && NameServerEnabled)
        {
            if (process->childUnHookedCount() > 0)
            {
                ChildUnHooked_Exit(process);
            }
        }
    
        if ( parent == NULL)
        {
            parent = Nodes->GetProcess( process->GetParentNid(),
                                        process->GetParentPid() );
        }

        // Unregister any interest in other process' death
        _TM_Txid_External transid;
        transid = invalid_trans();
        process->procExitUnregAll( transid );

        // Handle the process termination
        process->Exit( parent );

        process->Switch( parent ); // switch process pair roles if needed

        if ( process->IsPersistent() &&
             process->GetAbort() == false &&
            !MyNode->IsActivatingSpare() &&
            !MyNode->IsKillingNode() &&
             MyNode->GetShutdownLevel() == ShutdownLevel_Undefined &&
            (process->IsAbended()||
             process->GetNid() == downNode ||
             process->GetType() == ProcessType_SPX))
        {
            // see if we can restart the process
            restarted = RestartPersistentProcess( process, downNode );
            if ( !restarted )
            {
                if (!process->IsClone() && !MyNode->isInQuiesceState())
                {
                    // Replicate the exit to other nodes
                    if (!NameServerEnabled)
                    {
                        // Replicate the exit to other nodes
                        CReplExit *repl = new CReplExit(process->GetNid(),
                                                    process->GetPid(),
                                                    process->GetVerifier(),
                                                    process->GetName(),
                                                    process->IsAbended());
                        Replicator.addItem(repl);
                    }
                }
                else
                {
                    if (trace_settings & TRACE_SYNC)
                    {
                        trace_printf("%s@%d - not queuing process exit for clone %s\n", method_name, __LINE__, process->GetName());
                    }
                }
                if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
                    trace_printf("%s@%d" " - Persistent Process " "%s" " did not re-start on nid=" "%d" "\n", method_name, __LINE__, process->GetName(), process->GetNid());

                CNode * node;
                node = Nodes->GetLNode(process->GetNid())->GetNode();
                node->DeleteFromList( process );
            }
        }
        else
        {
            process->SetState (State_Stopped);
            if ( !process->IsClone() &&
                 (!MyNode->IsKillingNode() || MyNode->IsSoftNodeDown()) &&
                 !MyNode->isInQuiesceState() &&
                 !(process->GetType() == ProcessType_DTM &&
                   process->IsAbended() &&
                   MyNode->GetShutdownLevel() == ShutdownLevel_Undefined) )
            {
                if (!NameServerEnabled)
                {
                    // Replicate the exit to other nodes
                    CReplExit *repl = new CReplExit(process->GetNid(),
                                                process->GetPid(),
                                                process->GetVerifier(),
                                                process->GetName(),
                                                process->IsAbended());
                    Replicator.addItem(repl);
                }
            }
            else
            {
                if (trace_settings & TRACE_SYNC)
                {
                    trace_printf("%s@%d - not queuing process exit for clone %s\n", method_name, __LINE__, process->GetName());
                }
            }
            process->SetDeletePending ( true );
            if (process->IsAbended() || process->GetType() == ProcessType_SPX)
            {
                Child_Exit(process);
            }

            if (!process->IsClone() && process->GetType() == ProcessType_Watchdog)
            {
                HealthCheck.setState(HC_UPDATE_WATCHDOG, (long long)NULL);
            }
            CNode * node;
            node = Nodes->GetLNode(process->GetNid())->GetNode();
            node->DeleteFromList( process );

        }
    }
    TRACE_EXIT;

    return;
}
#endif

#ifdef NAMESERVER_PROCESS
void CProcessContainer::Exit_Process (CProcess *process, bool abend, int downNode)
{
    const char method_name[] = "CProcessContainer::Exit_Process(process)";
    TRACE_ENTRY;

    char la_buf[MON_STRING_BUF_SIZE];
    CProcess *parent = NULL;

    if (process)
    {
        if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
            trace_printf( "%s@%d - Process %s (abended=%d) is exiting, abend=%d, downNode=%d\n"
                        , method_name, __LINE__
                        , process->GetName()
                        , process->IsAbended()
                        , abend
                        , downNode );

        if ( process->GetState() == State_Down && abend && !process->IsAbended() )
        {
            process->SetAbended( abend );
        }
        if (process->GetNid() == downNode && !process->IsAbended() )
        {
            process->SetAbended( abend );
        }

        if ( numProcs_ <= 0 )
        {
            snprintf(la_buf, sizeof(la_buf),
                     "[%s], Node's process count is invalid, aborting\n",
                     method_name);
            mon_log_write(MON_PROCESSCONT_EXITPROCESS_1, SQ_LOG_ERR, la_buf);
            abort();
        }

        if ( process->GetState() == State_Stopped )
        {
            if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
                trace_printf("%s@%d" " - Process " "%s" " already exited." "\n", method_name, __LINE__, process->GetName());
            return;
        }

        if ( parent == NULL)
        {
            parent = Nodes->GetProcess( process->GetParentNid(),
                                        process->GetParentPid() );
        }

        // Handle the process termination
        process->Switch( parent ); // switch process pair roles if needed
        process->SetDeletePending ( true );

        CNode *node;
        node = Nodes->GetLNode(process->GetNid())->GetNode();
        node->DelFromNameMap ( process );
        node->DelFromPidMap ( process );
        node->DeleteFromList( process );
    }
    TRACE_EXIT;

    return;
}
#endif

CProcess *CProcessContainer::GetProcess (int pid)
{
    const char method_name[] = "CProcessContainer::GetProcess (pid)";
    TRACE_ENTRY;

    if ( ! nodeContainer_ )
    {
        // Programmer bonehead :^)
        // This must only be called from CNode (the physical node)
        abort();
    }

    pidMap_t::iterator it;
    CProcess *entry = NULL;

    pidMapLock_.lock();
    it = pidMap_->find(pid);
    if (it != pidMap_->end())
    {
        entry = it->second;

        // bugcatcher, temp call
        entry->validateObj();
    }
    pidMapLock_.unlock();

    if (trace_settings & TRACE_PROCESS_DETAIL)
    {
        trace_printf("%s@%d - pidmap_ (%p) entry=%p, pid=%d, Name=%s\n",
                     method_name, __LINE__, pidMap_, entry, pid,
                     ((entry != NULL) ? entry->GetName(): ""));
    }

    TRACE_EXIT;
    return entry;
}

CProcess *CProcessContainer::GetProcess (const char *name, bool checkstate)
{
    const char method_name[] = "CProcessContainer::GetProcess (name)";
    TRACE_ENTRY;

    if ( ! nodeContainer_ )
    {
        // Programmer bonehead :^)
        // This must only be called from CNode (the physical node)
        abort();
    }

    nameMap_t::iterator it;
    CProcess *entry = NULL;

    if ( ! strlen( name ) )
    {
        TRACE_EXIT;
        return entry;
    }
    char pname[MAX_PROCESS_NAME];
    strncpy(pname, name, MAX_PROCESS_NAME);
    pname[MAX_PROCESS_NAME-1] = '\0';

    NormalizeName (pname);

    // Look up name in process-name-to-process-object map.
    nameMapLock_.lock();
    it = nameMap_->find( pname );

    if (it != nameMap_->end())
    {
        entry = it->second;

        // bugcatcher, temp call
        entry->validateObj();

        if (trace_settings & TRACE_PROCESS_DETAIL)
            trace_printf("%s@%d - Name=%s, checkstate=%d, state=%d, backup=%d\n",
                         method_name, __LINE__, entry->GetName(), checkstate,
                         entry->GetState(), entry->IsBackup());

        if ( checkstate &&  entry->GetState() != State_Up)
        {   // Only return entry if it has completed startup
            if (trace_settings & TRACE_PROCESS)
                trace_printf( "%s@%d - Process %s (%d,%d:%d) not in 'Up' state"
                              ", checkstate=%d, state=%d, backup=%d\n"
                            ,  method_name, __LINE__
                            , entry->GetName()
                            , entry->GetNid()
                            , entry->GetPid()
                            , entry->GetVerifier()
                            , checkstate
                            , entry->GetState()
                            , entry->IsBackup());
            entry = NULL;
        }
    }
    nameMapLock_.unlock();

    TRACE_EXIT;

    return entry;
}

CProcess *CProcessContainer::GetProcess( int pid
                                       , Verifier_t verifier
                                       , bool checkstate )
{
    const char method_name[] = "CProcessContainer::GetProcess(pid, verifier)";
    TRACE_ENTRY;

    if ( ! nodeContainer_ )
    {
        // Programmer bonehead :^)
        // This must only be called from CNode (the physical node)
        abort();
    }

    CProcess *entry = NULL;

    if ( pid != -1 )
    {
        entry = CProcessContainer::GetProcess( pid );
    }

    if ( entry )
    {
        if ( (verifier != -1) && (verifier != entry->GetVerifier()) )
        {
           if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS))
           {
              trace_printf( "%s@%d - Get (%d:%d) failed -- verifier mismatch (%d)\n"
                          , method_name, __LINE__
                          , pid
                          , verifier
                          , entry->GetVerifier() );
           }
           entry = NULL;
        }
    }

    if ( entry && checkstate &&  entry->GetState() != State_Up)
    {   // Only return entry if it has completed startup
        if (trace_settings & TRACE_PROCESS)
            trace_printf( "%s@%d - Process %s (%d,%d:%d) not in 'Up' state"
                          ", checkstate=%d, state=%d, backup=%d\n"
                        ,  method_name, __LINE__
                        , entry->GetName()
                        , entry->GetNid()
                        , entry->GetPid()
                        , entry->GetVerifier()
                        , checkstate
                        , entry->GetState()
                        , entry->IsBackup());
        entry = NULL;
    }

    TRACE_EXIT;
    return entry;
}

CProcess *CProcessContainer::GetProcess( const char *name
                                       , Verifier_t verifier
                                       , bool checkstate )
{
    const char method_name[] = "CProcessContainer::GetProcess(name, verifier)";
    TRACE_ENTRY;

    if ( ! nodeContainer_ )
    {
        // Programmer bonehead :^)
        // This must only be called from CNode (the physical node)
        abort();
    }

    CProcess *entry = NULL;

    if ( strlen( name ) )
    {
        entry = CProcessContainer::GetProcess( name, checkstate );
    }

    if ( entry )
    {
        if ( (verifier != -1) && (verifier != entry->GetVerifier()) )
        {
           if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS))
           {
              trace_printf( "%s@%d - Get (%s:%d) failed -- verifier mismatch (%d)\n"
                          , method_name, __LINE__
                          , name
                          , verifier
                          , entry->GetVerifier() );
           }
           entry = NULL;
        }
    }

    TRACE_EXIT;
    return entry;
}

CProcess *CProcessContainer::GetProcessByType (PROCESSTYPE type)
{
    CProcess *entry = head_;

    const char method_name[] = "CProcessContainer::GetProcessByType";
    TRACE_ENTRY;

    if ( ! nodeContainer_ )
    {
        // Programmer bonehead :^)
        // This must only be called from CNode (the physical node)
        abort();
    }

    entry = entry->GetProcessByType( type );
    TRACE_EXIT;

    return entry;
}

// see: CLNode::GetProcessL (int pid)
// see: CLNode::GetProcessL (char *name, bool checkstate)

CProcess *CProcessContainer::GetProcessLByType(PROCESSTYPE type)
{
    CProcess *entry = head_;

    const char method_name[] = "CProcessContainer::GetProcessByType";
    TRACE_ENTRY;

    if ( nodeContainer_ )
    {
        // Programmer bonehead :^)
        // This must only be called from CLNode (the logical node)
        abort();
    }

    entry = entry->GetProcessLByType( type );

    TRACE_EXIT;

    return entry;
}

#ifndef NAMESERVER_PROCESS
void CProcessContainer::KillAll( STATE node_State, CProcess *requester )
{
    CProcess *process = NULL;
    int nid;

    const char method_name[] = "CProcessContainer::KillAll";
    TRACE_ENTRY;

    nameMapLock_.lock();

    nameMap_t::iterator nameMapIt;
    nameMap_t::iterator nameMapItSave;
    for ( nameMapIt = nameMap_->begin(); nameMapIt != nameMap_->end(); )
    {
        process = nameMapIt->second;
        assert( process );
        nameMapItSave = nameMapIt;
        ++nameMapIt;

        nid = process->GetNid();
        if (( process->GetType() != ProcessType_Watchdog ) &&
            ( process != requester                  )   )
        {
            if (node_State == State_Down)
            {
                int killedNid = process->GetNid();
                int killedPid = process->GetPid();
                bool killedIsClone = process->IsClone();

                // Delete name map entry
                nameMap_->erase(nameMapItSave);

                if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL))
                {
                    trace_printf("%s@%d removed from nameMap %p: %s (%d, %d)\n",
                                 method_name, __LINE__, nameMap_,
                                 process->GetName(), killedNid,
                                 killedPid);
                }

                // Delete pid map entry
                DelFromPidMap ( process );

                // Set process to "stopped" state.  SetProcessState
                // will invoke Exit_Process so "process" is not
                // valid after SetProcessState returns.
                SetProcessState( process, State_Stopped, true, -1);
                if ( nid == killedNid )
                {
                    if ( !killedIsClone && killedPid != -1)
                    {
                        kill (killedPid, SIGKILL);
                        if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
                            trace_printf("%s@%d - Completed kill for (%d, %d)\n", method_name, __LINE__, killedNid, killedPid);
                    }
                }
            }
            else
            {
                if (trace_settings & (TRACE_REQUEST_DETAIL | TRACE_RECOVERY | TRACE_SYNC_DETAIL | TRACE_TMSYNC | TRACE_PROCESS_DETAIL))
                    trace_printf("%s@%d change process (%d, %d) state to down\n", method_name, __LINE__, process->GetNid(), process->GetPid());
                process->SetState (State_Down);
                // Replicate the kill to other nodes
                CReplKill *repl = new CReplKill( process->GetNid()
                                               , process->GetPid()
                                               , process->GetVerifier()
                                               , process->GetAbort());
                Replicator.addItem(repl);
            }
        }
    }

    nameMapLock_.unlock();

    TRACE_EXIT;
}
#endif

#ifndef NAMESERVER_PROCESS
void CProcessContainer::KillAllDown()
{
    CProcess *process  = NULL;
    int nid = -1;
    int pid = -1;

    const char method_name[] = "CProcessContainer::KillAllDown";
    TRACE_ENTRY;

    nameMap_t::iterator nameMapIt;

    while ( true )
    {
        nameMapLock_.lock();
        nameMapIt = nameMap_->begin();

        if (nameMap_->size() == 0)
        {
            nameMapLock_.unlock();
            break; // all done
        }

        process = nameMapIt->second;

        // Delete name map entry
        nameMap_->erase (nameMapIt);

        nameMapLock_.unlock();

        nid = process->GetNid();
        pid = process->GetPid();

        if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL))
        {
            trace_printf("%s@%d removed from nameMap %p: %s (%d, %d)\n",
                         method_name, __LINE__, nameMap_,
                         process->GetName(), nid, pid);
        }

        // Delete pid map entry
        DelFromPidMap ( process );

        // valid for virtual cluster only.
        if ( !process->IsClone() && pid != -1 )
        {
            // killing the process will not remove the process object because
            // exit processing will get queued until this completes.
            kill( pid, SIGKILL );
            PROCESSTYPE type = process->GetType();
            if ( type == ProcessType_TSE ||
                 type == ProcessType_ASE )
            {
                // unmount volume would acquire nameMapLock_ internally.
                Devices->UnMountVolume( process->GetName(), process->IsBackup() );
            }
            if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
                trace_printf("%s@%d - Completed kill for %s (%d, %d)\n", method_name, __LINE__, process->GetName(), nid, pid);
        }

        // Remove all processes
        // PSD will re-create persistent processes on spare node activation
        Exit_Process( process, true, nid );
    }

    // clean up clone processes on this node that do not have entries in
    // nameMap_ or pidMap_ yet and restart persistent processes
    CProcess *nextProc = NULL;
    process = head_;

    while (process)
    {
        nextProc = process->GetNext();

        // Delete pid map entry
        DelFromPidMap ( process );

        Exit_Process( process, true, nid );

        process = nextProc;
    }

    TRACE_EXIT;
}
#endif

#ifndef NAMESERVER_PROCESS
void CProcessContainer::KillAllDownSoft()
{
    const char method_name[] = "CProcessContainer::KillAllDownSoft";
    TRACE_ENTRY;

    CProcess *process  = NULL;
    int nid = -1;
    int pid = -1;
    PROCESSTYPE type;
    nameMap_t::iterator nameMapIt;

    while ( true )
    {
        nameMapLock_.lock();
        nameMapIt = nameMap_->begin();

        if (nameMap_->size() == 0)
        {
            nameMapLock_.unlock();
            break; // all done
        }

        process = nameMapIt->second;

        // Delete name map entry
        nameMap_->erase (nameMapIt);

        nameMapLock_.unlock();

        nid = process->GetNid();
        pid = process->GetPid();
        type = process->GetType();

        if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL))
        {
            trace_printf("%s@%d removed from nameMap %p: %s (%d, %d)\n",
                         method_name, __LINE__, nameMap_,
                         process->GetName(), nid, pid);
        }

        // valid for virtual cluster or soft node down only.
        if ( type != ProcessType_DTM && type != ProcessType_NameServer )
        {
            // Delete pid map entry
            DelFromPidMap ( process );

            // valid for virtual cluster only.
            if ( !process->IsClone() && pid != -1 )
            {
                // killing the process will not remove the process object because
                // exit processing will get queued until this completes.
                kill( pid, SIGKILL );
                PROCESSTYPE type = process->GetType();
                if ( type == ProcessType_TSE ||
                     type == ProcessType_ASE )
                {
                    // unmount volume would acquire nameMapLock_ internally.
                    Devices->UnMountVolume( process->GetName(), process->IsBackup() );
                }
                if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
                    trace_printf("%s@%d - Completed kill for %s (%d, %d)\n", method_name, __LINE__, process->GetName(), nid, pid);
            }
            // Remove all processes
            // PSD will re-create persistent processes on spare node activation
            Exit_Process( process, true, nid );
        }
    }

    // clean up clone processes on this node that do not have entries in
    // nameMap_ or pidMap_ yet and restart persistent processes
    CProcess *nextProc = NULL;
    process = head_;

    while (process)
    {
        nextProc = process->GetNext();

        PROCESSTYPE type = process->GetType();
        if ( type != ProcessType_DTM && type != ProcessType_NameServer )
        {
            // Delete pid map entry
            DelFromPidMap ( process );

            Exit_Process( process, true, nid );
        }

        process = nextProc;
    }

    TRACE_EXIT;
}
#endif

char *CProcessContainer::NormalizeName (char *name)
{
    char *ptr;

    const char method_name[] = "CProcessContainer::NormalizeName";
    TRACE_ENTRY;
    ptr = name;
    while (*ptr)
    {
        *ptr = toupper (*ptr);
        ptr++;
    }
    TRACE_EXIT;

    return name;
}

#ifndef NAMESERVER_PROCESS
bool CProcessContainer::Open_Process (int nid, int pid, Verifier_t verifier, int death_notification, CProcess * process)
{
    bool status = FAILURE;
    CProcess *opener_process = NULL;
    char la_buf[MON_STRING_BUF_SIZE];

    const char method_name[] = "CProcessContainer::Open_Process";
    TRACE_ENTRY;
    if (process)
    {
        opener_process = Nodes->GetLNode (nid)->GetProcessL(pid);
        if (opener_process)
        {
            if ( (verifier != -1) && (verifier != process->GetVerifier()) )
            {
                if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS))
                {
                   trace_printf("%s@%d - Opener (%d, %d:%d) not found -- verifier mismatch (%d)\n",
                                method_name, __LINE__,
                                nid,
                                pid,
                                verifier,
                                opener_process->GetVerifier());
                }
            }
            else
            {
                status = opener_process->Open (process,death_notification);
            }
        }
        else
        {
            snprintf(la_buf, sizeof(la_buf),
                     "[%s], Failed, Can't find opener process, Pid=%d.\n",
                     method_name, pid);
            mon_log_write(MON_PROCESSCONT_OPENPROCESS_1, SQ_LOG_ERR, la_buf);
        }
    }
    else
    {
        snprintf(la_buf, sizeof(la_buf),
                "[%s], Failed, Can't find process.\n", method_name);
        mon_log_write(MON_PROCESSCONT_OPENPROCESS_2, SQ_LOG_ERR, la_buf);
    }
    TRACE_EXIT;

    return status;
}
#endif

#ifdef NAMESERVER_PROCESS
bool CProcessContainer::RestartPersistentProcess( CProcess *, int )
{
    return false;
}
#else
//
// Persistent process re-creation logic:
//
// o Process object is target of re-create
// o Process object type determines persist configuration template
// o Persist configuration template determines re-creation rules
//
//  Re-creation rules:
//
//  PROCESS_NAME format defines (%nid+/%nid) node re-creation scope
//          $<prefix>%nid+ or $<prefix>%nid or $<name>
//
//  PERSIST_ZONES format defines (%zid+/%zid) rules of re-creation within scope
//
//  (%nid+) Nid_ALL = one process in each node
//              Zid_ALL       = n/a
//              Zid_RELATIVE  = recreate only in initial <nid> assigned
//  (%nid ) Nid_RELATIVE = one process in cluster
//              Zid_ALL       = recreate in current up <nid> or next up <nid>
//              Zid_RELATIVE  = recreate only in initial <nid> assigned (non-HA)
//  (     ) Nid_Undefined = one process in cluster
//              Zid_ALL       = recreate in current up <nid> or next up <nid>
//              Zid_RELATIVE  = recreate only in initial <nid> assigned (non-HA)
//
bool CProcessContainer::RestartPersistentProcess( CProcess *process, int downNid )
{
    const char method_name[] = "CProcessContainer::RestartPersistentProcess";
    TRACE_ENTRY;

    bool successful = false;
    bool restart = false;
    int nid = -1;
    int max_retries = 3;
    int retry_max_time = 1;
    CNode *currenNode;
    CNode *newNode;
    CLNode *currentLNode;
    CLNode *newLNode;
    CProcess *parent = NULL;
    CClusterConfig *clusterConfig = Nodes->GetClusterConfig();
    CPersistConfig *persistConfig = NULL;

    assert(clusterConfig != NULL);

    persistConfig = clusterConfig->GetPersistConfig( process->GetType()
                                                   , process->GetName()
                                                   , process->GetNid() );
    if (persistConfig)
    {
        max_retries = persistConfig->GetPersistRetries();
        retry_max_time = persistConfig->GetPersistWindow();
    }
    else
    {
        char buf[MON_STRING_BUF_SIZE];
        snprintf( buf, sizeof(buf)
                , "[%s], Persistent process %s not "
                  "restarted because the persist configuration is "
                  "missing.\n"
                , method_name
                , process->GetName() );
        mon_log_write(MON_PROCESS_PERSIST_2, SQ_LOG_ERR, buf);
        return false;
    }

    // if 1st time retrying to restart process
    if (process->GetPersistentCreateTime() == 0)
    {
        process->SetFirstInstance(false);
        process->SetPersistentCreateTime ( time(NULL) );
    }

    if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
       trace_printf( "%s@%d - Persistent process retries = %d, "
                     "time limit = %d, down nid=%d\n"
                   , method_name, __LINE__
                   , max_retries, retry_max_time, downNid);

    // get the parent process if any
    if (process->GetParentNid() != -1 && process->GetParentPid() != -1)
    {
        parent = Nodes->GetLNode( process->GetParentNid())->GetProcessL(process->GetParentPid() );
    }

    currentLNode = Nodes->GetLNode( process->GetNid() );
    newLNode     = Nodes->GetLNodeNext( process->GetNid() );

    switch (persistConfig->GetProcessNameNidFormat())
    {
    case Nid_ALL:       // one process in each <nid>
        switch (persistConfig->GetZoneZidFormat())
        {
        case Zid_ALL:       // n/a
            char buf[MON_STRING_BUF_SIZE];
            snprintf( buf, sizeof(buf)
                    , "[%s], Persistent process %s not "
                      "restarted because the persist configuration is "
                      "inconsistent for key %s.\n"
                    , method_name
                    , process->GetName()
                    , persistConfig->GetPersistPrefix() );
            mon_log_write(MON_PROCESS_PERSIST_2, SQ_LOG_ERR, buf);
            return false;
        case Zid_RELATIVE:  // recreate only in initial <nid> assigned
        default:
            // Is this a node down and node going down is process' node?
            if ( downNid != -1 && currentLNode->GetNid() == downNid )
            {
                if (trace_settings &
                    (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
                    trace_printf( "%s@%d - original node is not available, nid=%d, downNid=%d\n"
                                , method_name, __LINE__
                                , currentLNode->GetNid()
                                , downNid );
            }
            else
            {
                if ( currentLNode->GetState() == State_Up)
                {
                    if (trace_settings &
                        (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
                    {
                        trace_printf( "%s@%d - original node is available, nid=%d\n"
                                    , method_name, __LINE__, process->GetNid());
                    }

                    if ( MyNode->IsMyNode(process->GetNid()) )
                    {
                        restart = true;
                    }
                }
                else
                {
                    if (trace_settings &
                        (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
                        trace_printf( "%s@%d - original node is not available, nid=%d, downNid=%d\n"
                                    , method_name, __LINE__
                                    , currentLNode->GetNid()
                                    , downNid );
                }
            }
        } // switch
        break;
    case Nid_RELATIVE:  // one process in cluster
    case Nid_Undefined: // one process in cluster
    default:
        switch (persistConfig->GetZoneZidFormat())
        {
        case Zid_ALL:       // recreate in current up <nid> or next up <nid>
            // check if we need to do something because the node is down and
            // spare node is not activating
            if ((downNid != -1 && !currentLNode->GetNode()->IsSpareNode()) ||
                 currentLNode->GetState() == State_Down )
            {
                nid = (newLNode) ? newLNode->GetNid() : -1;
                if ( newLNode &&
                    (newLNode->GetState() == State_Up &&
                     newLNode->GetNid() != downNid ) )
                {
                    if (MyNode->IsMyNode(nid))
                    {
                        // OK we need to move the process to our node
                        if (trace_settings &
                            (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
                            trace_printf( "%s@%d - Moving process from nid=%d to new nid=%d\n"
                                        , method_name, __LINE__
                                        , process->GetNid(), nid);
                        currenNode = currentLNode->GetNode();
                        currenNode->RemoveFromList(process);
                        process->SetNid ( nid );
                        process->SetPid ( -1 );
                        newNode = newLNode->GetNode();
                        newNode->AddToList( process );
                        process->SetClone( false );
                        // Replicate the clone to other nodes
                        CReplClone *repl = new CReplClone(process);
                        Replicator.addItem(repl);
                        restart = true;
                    }
                    else
                    {
                        if (trace_settings &
                            (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
                            trace_printf( "%s@%d - Not moving process from nid=%d to nid=%d""\n"
                                        , method_name, __LINE__
                                        , process->GetNid(), nid);
                    }
                }
                else
                {
                    if (trace_settings &
                        (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
                       trace_printf( "%s@%d - Next possible node is not available, nid=%d\n"
                                    , method_name, __LINE__, nid);
                }
            }
            else
            {
                if (trace_settings &
                    (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
                    trace_printf( "%s@%d - original node is available, nid=%d\n"
                                , method_name, __LINE__, process->GetNid());

                if ( MyNode->IsMyNode(process->GetNid()) )
                {
                    restart = true;
                }
            }
            break;
        case Zid_RELATIVE:  // recreate only in initial <nid> assigned (non-HA)
        default:
            // Is this a node down and node going down is process' node?
            if ( downNid != -1 && currentLNode->GetNid() == downNid )
            {
                if (trace_settings &
                    (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
                    trace_printf( "%s@%d - original node is not available, nid=%d, downNid=%d\n"
                                , method_name, __LINE__
                                , currentLNode->GetNid()
                                , downNid );
            }
            else
            {
                if ( currentLNode->GetState() == State_Up)
                {
                    if (trace_settings &
                        (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
                    {
                        trace_printf( "%s@%d - original node is available, nid=%d\n"
                                    , method_name, __LINE__, process->GetNid());
                    }

                    if ( MyNode->IsMyNode(process->GetNid()) )
                    {
                        restart = true;
                    }
                }
                else
                {
                    if (trace_settings &
                        (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
                        trace_printf( "%s@%d - original node is not available, nid=%d, downNid=%d\n"
                                    , method_name, __LINE__
                                    , currentLNode->GetNid()
                                    , downNid );
                }
            }
        }
        break;
    }

    if ( Nodes->IsShutdownActive() )
    {
        if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
            trace_printf("%s@%d" " - Shutdown process " "%s" " on nid=" "%d"   "\n", method_name, __LINE__, process->GetName(), process->GetNid());
        successful = false;
    }
    else
    {
        // Re-initialize process flags
        process->SetState (State_Unknown);

        if (( restart                    ) &&
            ( MyNode->IsMyNode(process->GetNid()) ))
        {
            // check if we should retry to create the process
            if ( (time(NULL) - process->GetPersistentCreateTime()) < retry_max_time )
            {
                int retryCount = process->GetPersistentRetries();
                if ( retryCount < max_retries )
                {
                    ++retryCount;
                    process->SetPersistentRetries ( retryCount );
                }
                else
                {
                    if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
                        trace_printf("%s@%d - Max retries exceeded for "
                                     "process %s, retry count=%d, max "
                                     "retries=%d\n", method_name, __LINE__,
                                     process->GetName(), retryCount,
                                     max_retries);

                    char buf[MON_STRING_BUF_SIZE];

                    snprintf(buf, sizeof(buf), "[%s], Persistent process %s "
                             "not restarted because the maximum retry count "
                             "(%d) has been exceeded.\n",
                             method_name, process->GetName(), retryCount);
                    mon_log_write(MON_PROCESS_PERSIST_1, SQ_LOG_INFO, buf);

                    if ( process->GetType() == ProcessType_DTM ||
                         process->GetType() == ProcessType_PSD ||
                         process->GetType() == ProcessType_TMID ||
                         process->GetType() == ProcessType_Watchdog ||
                         process->GetType() == ProcessType_SMS )
                    {
                        if ( process->GetType() == ProcessType_DTM )
                        {
                            MyNode->SetDTMAborted( true );
                        }
                        if ( process->GetType() == ProcessType_SMS )
                        {
                            MyNode->SetSMSAborted( true );
                        }

                        snprintf(buf, sizeof(buf), "[%s], Critial persistent process %s "
                                 "not restarted, "
                                 "scheduling node down on node %s (%d)!\n",
                                 method_name, process->GetName(), MyNode->GetName(), MyPNID);
                        mon_log_write(MON_PROCESS_PERSIST_4, SQ_LOG_CRIT, buf);

                        ReqQueue.enqueueDownReq(MyPNID);
                    }

                    return false;
                }
            }
            else
            {
                process->SetPersistentRetries ( 0 );
                if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
                    trace_printf("%s@%d" " - Retries count reset for process " "%s" "\n", method_name, __LINE__, process->GetName());
            }

            if ( process->GetType() == ProcessType_DTM )
            {
                // Kill all local processes
                Monitor->SoftNodeDown( MyPNID );
            }

            // OK ... just restart the process on the same node
            if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
                trace_printf("%s@%d" " - Restarting process " "%s" " on nid=" "%d"   "\n", method_name, __LINE__, process->GetName(), process->GetNid());
            process->SetDeletePending ( false );
            process->SetStartupCompleted ( false );
            process->SetPriorPid( !MyNode->IsSpareNode() ? process->GetPid() : 0 );
            process->SetClone( false );
            int result;
            successful = process->Create(parent, 0, result);
            if (successful)
            {
                process->SetAbended( false );
                Nodes->GetLNode (process->GetNid())->GetNode()
                    ->AddToNameMap(process);
                Nodes->GetLNode (process->GetNid())->GetNode()
                    ->AddToPidMap(process->GetPid(), process);
                process->SetPersistentCreateTime ( time(NULL) );
                if ( process->GetType() == ProcessType_SSMP )
                {
                    Nodes->GetLNode ( process->GetNid() )->SetSSMProc ( process );
                }
            }
            else
            {
                if ( process->GetType() == ProcessType_DTM )
                {
                    char buf[MON_STRING_BUF_SIZE];
                    snprintf( buf, sizeof(buf)
                            , "[%s], DTM (%s) persistent restart failed, Node %s going down\n"
                            , method_name, process->GetName(), MyNode->GetName());
                    mon_log_write(MON_PROCESS_PERSIST_6, SQ_LOG_INFO, buf);

                    snprintf( buf, sizeof(buf),
                              "DTM (%s) persistent restart failed, Node %s going down\n",
                              process->GetName(), MyNode->GetName());
                    genSnmpTrap( buf );

                    // DTM just died unexpectedly, so bring the node down
                    Monitor->HardNodeDown(MyPNID, true);
                }
            }
        }
        else
        {
            if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
                trace_printf("%s@%d" " - Not restarting process " "%s" " on nid=" "%d"   "\n", method_name, __LINE__, process->GetName(), process->GetNid());
            successful = restart;
        }
    }

    TRACE_EXIT;

    return successful;
}
#endif

#ifndef NAMESERVER_PROCESS
void CProcessContainer::PidHangupSet ( int pid )
{
    hungupPidsLock_.lock();
    hungupPids_.insert ( pid );
    hungupPidsLock_.unlock();
}
#endif

#ifndef NAMESERVER_PROCESS
void CProcessContainer::PidHangupClear ( int pid )
{
    hungupPidsLock_.lock();
    hungupPids_.erase ( pid );
    hungupPidsLock_.unlock();
}
#endif

#ifndef NAMESERVER_PROCESS
void CProcessContainer::CheckFdState ( int fd )
{
    const char method_name[   ] = "CProcessContainer::CheckFdState";
    char buf[MON_STRING_BUF_SIZE];

    int epollfd = epoll_create(5);
    if (epollfd == -1)
    {
        snprintf(buf, sizeof(buf), "[%s], epoll_create error, %s (%d)\n",
                method_name, strerror(errno), errno);
        mon_log_write(MON_PROCESS_CHECKFDSTATE_1, SQ_LOG_ERR, buf);

        return;
    }

    // Add file descriptor to epoll set
    struct epoll_event ev;
    memset(&ev, 0, sizeof(ev));
    ev.events = EPOLLIN;
    ev.data.fd = fd;

    if ((epoll_ctl(epollfd, EPOLL_CTL_ADD, fd, &ev) == -1)
        && (errno != EEXIST))
    {
        snprintf(buf, sizeof(buf),
                 "[%s], epoll_ctl error, adding fd=%d, %s (%d)\n",
                method_name, fd, strerror(errno), errno);
        mon_log_write(MON_PROCESS_CHECKFDSTATE_2, SQ_LOG_ERR, buf);

        return;
    }

    // see if hangup is still asserted on stderr
    struct epoll_event event_list[10];
    int ready_fds = epoll_wait (epollfd, event_list, 10, 0);

    if (ready_fds == -1)
    {  // epoll_wait error
        snprintf(buf, sizeof(buf), "[%s], epoll_wait error, %s (%d)\n",
                method_name, strerror(errno), errno);
        mon_log_write(MON_PROCESS_CHECKFDSTATE_3, SQ_LOG_ERR, buf);
    }
    else if (ready_fds != 0)
    {
        for (int n=0; n < ready_fds; n++)
        {
            snprintf(buf, sizeof(buf),
                     "[%s], for fd=%d, events=%d\n", method_name,
                     event_list[n].data.fd, event_list[n].events);
            mon_log_write(MON_PROCESS_CHECKFDSTATE_4, SQ_LOG_INFO, buf);
        }
    }
    else
    {  // Indicate the epoll hangup no longer asserted
        snprintf(buf, sizeof(buf),
                 "[%s], No events pending for fd=%d\n", method_name, fd);
        mon_log_write(MON_PROCESS_CHECKFDSTATE_5, SQ_LOG_INFO, buf);
    }

    close( epollfd );
}
#endif

#ifndef NAMESERVER_PROCESS
void CProcessContainer::PidHangupCheck ( time_t now )
{
    const char method_name[   ] = "CProcessContainer::PidHangupCheck";
    TRACE_ENTRY;
    char buf[MON_STRING_BUF_SIZE];

    // Examine the list of processes for which we have received a
    // pipe hangup indication but have not received a child death
    // signal.
    hungupPidsLock_.lock();
    int pid;
    for (hungupPids_t::const_iterator it = hungupPids_.begin();
         it != hungupPids_.end();)
    {
        pid = *it;
        ++it;

        if (trace_settings & TRACE_PROCESS)
        {
            trace_printf("%s@%d process %d is in hangup list\n",
                         method_name, __LINE__, pid);
        }

        CProcess * process = GetProcess (pid);
        time_t hangupTime = 0;

        if (process)
        {
            hangupTime = process->GetHangupTime();
            if ( now < (hangupTime + PROCESS_DEATH_MARGIN) )
            {   // Process hangup detected recently.  Wait a while before
                // taking action on this process.  This allows time for
                // child death signal to arrive.

                // temp trace
                if (trace_settings & TRACE_PROCESS)
                {
                    trace_printf("%s@%d process %d not yet ripe\n",
                                 method_name, __LINE__, pid);
                }

                continue;
            }
        }

        // See if process is still alive
        if (kill(pid,0) == -1)
        {
            if (errno == ESRCH)
            {   // Process no longer exists
                if (trace_settings & TRACE_PROCESS)
                    trace_printf("%s@%d process %d no longer exists\n",
                                 method_name, __LINE__, pid);
                // Log info
                snprintf(buf, sizeof(buf),
                         "[%s], process %d no longer exists, initiating "
                         "exit processing\n", method_name, pid);
                mon_log_write(MON_PROCESS_PIDHANGUPCHECK_1, SQ_LOG_INFO, buf);

                // Remove from set
                hungupPids_.erase ( pid );

                // set state process
                // Queue request for processing by worker thread
                ReqQueue.enqueueChildDeathReq ( pid );

                // release buffers
                // todo
            }
            else
            {
                int saveerrno = errno;

                if (trace_settings & TRACE_PROCESS)
                    trace_printf("%s@%d process %d, errno=%d (%p)\n",
                                 method_name, __LINE__, pid, saveerrno,
                                 strerror(saveerrno));

                // Log info
                snprintf(buf, sizeof(buf),
                         "[%s], error getting process %d info, %s (%d)\n",
                        method_name, pid, strerror(saveerrno), saveerrno);
                mon_log_write(MON_PROCESS_PIDHANGUPCHECK_2, SQ_LOG_INFO, buf);
            }
        }
        else
        {
            char timestring[50];
            if (process)
            {
                strcpy(timestring, ctime ( &hangupTime ));
                timestring[strlen(timestring)-1] = '\0';
            }
            else
            {
                strcpy(timestring, "unknown");
            }

            if (trace_settings & TRACE_PROCESS)
                trace_printf("%s@%d process %d (%s) still running, no child "
                             "death indication received (hangup at %s)\n",
                             method_name, __LINE__, pid,
                             ((process != NULL) ? process->GetName(): "unknown"),
                             timestring);

            // Log info
            snprintf(buf, sizeof(buf),
                     "[%s], process %d (%s) still running, no child death "
                     "indication received (hangup at %s)\n", method_name, pid,
                     ((process != NULL) ? process->GetName() : "unknown"),
                     timestring);
            mon_log_write(MON_PROCESS_PIDHANGUPCHECK_3, SQ_LOG_INFO, buf);


            if (process)
                CheckFdState( process->FdStderr() );

            // Possibly kill process after sufficient time has elapsed
            // todo
        }
    }
    hungupPidsLock_.unlock();

    TRACE_EXIT;
}
#endif

void CProcessContainer::SetProcessState( CProcess *process, STATE state, bool abend, int downNode )
{
    const char method_name[] = "CProcessContainer::SetProcessState(process)";
    TRACE_ENTRY;

    if ( process )
    {
        switch ( state )
        {
        case State_Down:
            // Process intends to exits, when the child death arrives the
            // State_Stopped is processed
            if (trace_settings & TRACE_PROCESS)
                trace_printf( "%s@%d Setting State_Down for process %s(%d,%d:%d), abend=%d, down=%d\n"
                            , method_name, __LINE__
                            , process->GetName()
                            , process->GetNid()
                            , process->GetPid()
                            , process->GetVerifier()
                            , abend, downNode );
            process->SetState( State_Down );
            if ( abend && !process->IsAbended() )
            {
                process->SetAbended( abend );
            }
            break;

        case State_Stopped:
            if ( process->GetState() != State_Stopped )
            {
                // Process terminated so handle the exit processing.
                // Termination detected through a child death signal or
                // a broken stderr pipe for an attached process.

                // Note: Exit_Process() will delete the process object, so
                //       save the process information needed before the call
#ifndef NAMESERVER_PROCESS
                PROCESSTYPE processType = process->GetType();
#endif
                string      processName = process->GetName();
                int         processNid  = process->GetNid();
                int         processPid  = process->GetPid();
                Verifier_t  processVerifier = process->GetVerifier();
#ifndef NAMESERVER_PROCESS
                Exit_Process( process, abend, downNode );
#endif
                if (trace_settings & TRACE_PROCESS)
                    trace_printf( "%s@%d Set State_Stopped for process %s(%d,%d:%d), abend=%d, down=%d, "
                                  "killingMyNode=%d,DTM aborted=%d, SMS aborted=%d\n"
                                , method_name, __LINE__
                                , processName.c_str(), processNid, processPid, processVerifier
                                , abend, downNode
                                , MyNode->IsKillingNode(), MyNode->IsDTMAborted(), MyNode->IsSMSAborted());
#ifndef NAMESERVER_PROCESS
                if ( !MyNode->IsKillingNode() )
                {
                    switch ( processType )
                    {
                    case ProcessType_DTM:
                        if ( MyNode->GetState() != State_Shutdown &&
                             MyNode->IsDTMAborted() )
                        {
                            char buf[MON_STRING_BUF_SIZE];
                            snprintf(buf, sizeof(buf),
                                     "[%s], DTM (%s) aborted, Node %s going down\n",
                                     method_name, processName.c_str(), MyNode->GetName());
                            mon_log_write(MON_PROCESS_SETSTATE_1, SQ_LOG_INFO, buf);

                            snprintf( buf, sizeof(buf),
                                      "DTM (%s) aborted, Node %s going down\n",
                                      processName.c_str(), MyNode->GetName());
                            genSnmpTrap( buf );

                            // DTM just died unexpectedly, so bring the node down
                            Monitor->HardNodeDown(MyPNID, true);
                        }
                        break;
                    case ProcessType_SMS:
                        if ( MyNode->GetState() != State_Shutdown &&
                             MyNode->IsSMSAborted() )
                        {
                            char buf[MON_STRING_BUF_SIZE];
                            snprintf(buf, sizeof(buf),
                                     "[%s], SMS (%s) aborted, Node %s going down\n",
                                     method_name, processName.c_str(), MyNode->GetName());
                            mon_log_write(MON_PROCESS_SETSTATE_2, SQ_LOG_INFO, buf);

                            snprintf( buf, sizeof(buf),
                                      "SMS (%s) aborted, Node %s going down\n",
                                      processName.c_str(), MyNode->GetName());
                            genSnmpTrap( buf );

                            // SMS just died unexpectedly, so bring the node down
                            Monitor->HardNodeDown(MyPNID, true);
                        }
                        break;
                    default: // no special handling
                        break;
                    }
                }
#endif
            }
            break;
        default:
            process->SetState( state );
            break;
        }
    }

    TRACE_EXIT;
}



#ifndef NAMESERVER_PROCESS
bool CProcessContainer::WhoEnlisted( _TM_Txid_External trans_id, struct message_def *msg )
{
    int idx;
    CProcess *process = head_;
    CNotice *notice;

    const char method_name[] = "CProcessContainer::WhoEnlisted";
    TRACE_ENTRY;
    while ((process) &&
           (msg->u.reply.u.trans_info.num_processes < MAX_PROC_LIST ))
    {
        notice = process->GetNoticeHead();
        while (notice)
        {
            if ( isEqual( notice->TransID, trans_id ) )
            {
                idx = msg->u.reply.u.trans_info.num_processes;
                msg->u.reply.u.trans_info.procs[idx].nid = process->GetNid();
                msg->u.reply.u.trans_info.procs[idx].pid = process->GetPid();
                msg->u.reply.u.trans_info.procs[idx].trans_id = trans_id;
                msg->u.reply.u.trans_info.num_processes++;
                if (msg->u.reply.u.trans_info.num_processes >= MAX_PROC_LIST)
                {
                    msg->u.reply.u.trans_info.return_code = MPI_ERR_TRUNCATE;
                    return FAILURE;
                }
                break;
            }
            notice = notice->GetNext();
        }
        process = process->GetNext();
    }

    TRACE_EXIT;
    return SUCCESS;
}
#endif
