| |
| <!DOCTYPE HTML> |
<html lang="en">
| <head> |
| <meta charset="UTF-8"> |
| <meta content="text/html; charset=utf-8" http-equiv="Content-Type"> |
<title>High Availability and Failover · ActiveMQ Artemis Documentation</title>
| <meta http-equiv="X-UA-Compatible" content="IE=edge" /> |
| <meta name="description" content=""> |
| <meta name="generator" content="GitBook 3.2.3"> |
| |
| |
| |
| |
| <link rel="stylesheet" href="gitbook/style.css"> |
| |
| |
| |
| |
| <link rel="stylesheet" href="gitbook/gitbook-plugin-highlight/website.css"> |
| |
| |
| |
| <link rel="stylesheet" href="gitbook/gitbook-plugin-search/search.css"> |
| |
| |
| |
| <link rel="stylesheet" href="gitbook/gitbook-plugin-fontsettings/website.css"> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <meta name="HandheldFriendly" content="true"/> |
| <meta name="viewport" content="width=device-width, initial-scale=1, user-scalable=no"> |
| <meta name="apple-mobile-web-app-capable" content="yes"> |
| <meta name="apple-mobile-web-app-status-bar-style" content="black"> |
| <link rel="apple-touch-icon-precomposed" sizes="152x152" href="gitbook/images/apple-touch-icon-precomposed-152.png"> |
| <link rel="shortcut icon" href="gitbook/images/favicon.ico" type="image/x-icon"> |
| |
| |
| <link rel="next" href="graceful-shutdown.html" /> |
| |
| |
| <link rel="prev" href="clusters.html" /> |
| |
| |
| </head> |
| <body> |
| |
| <div class="book"> |
| <div class="book-summary"> |
| |
| |
| <div id="book-search-input" role="search"> |
| <input type="text" placeholder="Type to search" /> |
| </div> |
| |
| |
| <nav role="navigation"> |
| |
| |
| |
| <ul class="summary"> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <li class="chapter " data-level="1.1" data-path="./"> |
| |
| <a href="./"> |
| |
| |
| Introduction |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.2" data-path="notice.html"> |
| |
| <a href="notice.html"> |
| |
| |
| Legal Notice |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.3" data-path="preface.html"> |
| |
| <a href="preface.html"> |
| |
| |
| Preface |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.4" data-path="project-info.html"> |
| |
| <a href="project-info.html"> |
| |
| |
| Project Info |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.5" data-path="messaging-concepts.html"> |
| |
| <a href="messaging-concepts.html"> |
| |
| |
| Messaging Concepts |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.6" data-path="architecture.html"> |
| |
| <a href="architecture.html"> |
| |
| |
| Architecture |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.7" data-path="using-server.html"> |
| |
| <a href="using-server.html"> |
| |
| |
| Using the Server |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.8" data-path="address-model.html"> |
| |
| <a href="address-model.html"> |
| |
| |
| Address Model |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.9" data-path="using-jms.html"> |
| |
| <a href="using-jms.html"> |
| |
| |
| Using JMS |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.10" data-path="using-core.html"> |
| |
| <a href="using-core.html"> |
| |
| |
| Using Core |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.11" data-path="using-AMQP.html"> |
| |
| <a href="using-AMQP.html"> |
| |
| |
| Using AMQP |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.12" data-path="jms-core-mapping.html"> |
| |
| <a href="jms-core-mapping.html"> |
| |
| |
| Mapping JMS Concepts to the Core API |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.13" data-path="client-classpath.html"> |
| |
| <a href="client-classpath.html"> |
| |
| |
| The Client Classpath |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.14" data-path="examples.html"> |
| |
| <a href="examples.html"> |
| |
| |
| Examples |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.15" data-path="wildcard-routing.html"> |
| |
| <a href="wildcard-routing.html"> |
| |
| |
| Routing Messages With Wild Cards |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.16" data-path="wildcard-syntax.html"> |
| |
| <a href="wildcard-syntax.html"> |
| |
| |
| Understanding the Apache ActiveMQ Artemis Wildcard Syntax |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.17" data-path="filter-expressions.html"> |
| |
| <a href="filter-expressions.html"> |
| |
| |
| Filter Expressions |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.18" data-path="persistence.html"> |
| |
| <a href="persistence.html"> |
| |
| |
| Persistence |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.19" data-path="configuring-transports.html"> |
| |
| <a href="configuring-transports.html"> |
| |
| |
| Configuring Transports |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.20" data-path="config-reload.html"> |
| |
| <a href="config-reload.html"> |
| |
| |
| Configuration Reload |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.21" data-path="connection-ttl.html"> |
| |
| <a href="connection-ttl.html"> |
| |
| |
| Detecting Dead Connections |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.22" data-path="slow-consumers.html"> |
| |
| <a href="slow-consumers.html"> |
| |
| |
| Detecting Slow Consumers |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.23" data-path="network-isolation.html"> |
| |
| <a href="network-isolation.html"> |
| |
| |
| Avoiding Network Isolation |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.24" data-path="critical-analysis.html"> |
| |
| <a href="critical-analysis.html"> |
| |
| |
| Detecting Broker Issues (Critical Analysis) |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.25" data-path="transaction-config.html"> |
| |
| <a href="transaction-config.html"> |
| |
| |
| Resource Manager Configuration |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.26" data-path="flow-control.html"> |
| |
| <a href="flow-control.html"> |
| |
| |
| Flow Control |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.27" data-path="send-guarantees.html"> |
| |
| <a href="send-guarantees.html"> |
| |
| |
| Guarantees of sends and commits |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.28" data-path="undelivered-messages.html"> |
| |
| <a href="undelivered-messages.html"> |
| |
| |
| Message Redelivery and Undelivered Messages |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.29" data-path="message-expiry.html"> |
| |
| <a href="message-expiry.html"> |
| |
| |
| Message Expiry |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.30" data-path="large-messages.html"> |
| |
| <a href="large-messages.html"> |
| |
| |
| Large Messages |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.31" data-path="paging.html"> |
| |
| <a href="paging.html"> |
| |
| |
| Paging |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.32" data-path="queue-attributes.html"> |
| |
| <a href="queue-attributes.html"> |
| |
| |
| Queue Attributes |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.33" data-path="scheduled-messages.html"> |
| |
| <a href="scheduled-messages.html"> |
| |
| |
| Scheduled Messages |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.34" data-path="last-value-queues.html"> |
| |
| <a href="last-value-queues.html"> |
| |
| |
| Last-Value Queues |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.35" data-path="message-grouping.html"> |
| |
| <a href="message-grouping.html"> |
| |
| |
| Message Grouping |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.36" data-path="pre-acknowledge.html"> |
| |
| <a href="pre-acknowledge.html"> |
| |
| |
| Extra Acknowledge Modes |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.37" data-path="management.html"> |
| |
| <a href="management.html"> |
| |
| |
| Management |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.38" data-path="management-console.html"> |
| |
| <a href="management-console.html"> |
| |
| |
| Management Console |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.39" data-path="security.html"> |
| |
| <a href="security.html"> |
| |
| |
| Security |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.40" data-path="masking-passwords.html"> |
| |
| <a href="masking-passwords.html"> |
| |
| |
| Masking Passwords |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.41" data-path="broker-plugins.html"> |
| |
| <a href="broker-plugins.html"> |
| |
| |
| Broker Plugins |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.42" data-path="resource-limits.html"> |
| |
| <a href="resource-limits.html"> |
| |
| |
| Resource Limits |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.43" data-path="jms-bridge.html"> |
| |
| <a href="jms-bridge.html"> |
| |
| |
| The JMS Bridge |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.44" data-path="client-reconnection.html"> |
| |
| <a href="client-reconnection.html"> |
| |
| |
| Client Reconnection and Session Reattachment |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.45" data-path="diverts.html"> |
| |
| <a href="diverts.html"> |
| |
| |
| Diverting and Splitting Message Flows |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.46" data-path="core-bridges.html"> |
| |
| <a href="core-bridges.html"> |
| |
| |
| Core Bridges |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.47" data-path="duplicate-detection.html"> |
| |
| <a href="duplicate-detection.html"> |
| |
| |
| Duplicate Message Detection |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.48" data-path="clusters.html"> |
| |
| <a href="clusters.html"> |
| |
| |
| Clusters |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter active" data-level="1.49" data-path="ha.html"> |
| |
| <a href="ha.html"> |
| |
| |
| High Availability and Failover |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.50" data-path="graceful-shutdown.html"> |
| |
| <a href="graceful-shutdown.html"> |
| |
| |
| Graceful Server Shutdown |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.51" data-path="libaio.html"> |
| |
| <a href="libaio.html"> |
| |
| |
| Libaio Native Libraries |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.52" data-path="thread-pooling.html"> |
| |
| <a href="thread-pooling.html"> |
| |
| |
| Thread management |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.53" data-path="logging.html"> |
| |
| <a href="logging.html"> |
| |
| |
| Logging |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.54" data-path="rest.html"> |
| |
| <a href="rest.html"> |
| |
| |
| REST Interface |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.55" data-path="embedding-activemq.html"> |
| |
| <a href="embedding-activemq.html"> |
| |
| |
| Embedding Apache ActiveMQ Artemis |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.56" data-path="karaf.html"> |
| |
| <a href="karaf.html"> |
| |
| |
| Apache Karaf |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.57" data-path="spring-integration.html"> |
| |
| <a href="spring-integration.html"> |
| |
| |
| Spring Integration |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.58" data-path="cdi-integration.html"> |
| |
| <a href="cdi-integration.html"> |
| |
| |
| CDI Integration |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.59" data-path="intercepting-operations.html"> |
| |
| <a href="intercepting-operations.html"> |
| |
| |
| Intercepting Operations |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.60" data-path="protocols-interoperability.html"> |
| |
| <a href="protocols-interoperability.html"> |
| |
| |
| Protocols and Interoperability |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.61" data-path="tools.html"> |
| |
| <a href="tools.html"> |
| |
| |
| Tools |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.62" data-path="maven-plugin.html"> |
| |
| <a href="maven-plugin.html"> |
| |
| |
| Maven Plugin |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.63" data-path="unit-testing.html"> |
| |
| <a href="unit-testing.html"> |
| |
| |
| Unit Testing |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.64" data-path="perf-tuning.html"> |
| |
| <a href="perf-tuning.html"> |
| |
| |
| Troubleshooting and Performance Tuning |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.65" data-path="configuration-index.html"> |
| |
| <a href="configuration-index.html"> |
| |
| |
| Configuration Reference |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| <li class="chapter " data-level="1.66" data-path="updating-artemis.html"> |
| |
| <a href="updating-artemis.html"> |
| |
| |
| Updating Artemis |
| |
| </a> |
| |
| |
| |
| </li> |
| |
| |
| |
| |
| <li class="divider"></li> |
| |
| <li> |
| <a href="https://www.gitbook.com" target="blank" class="gitbook-link"> |
| Published with GitBook |
| </a> |
| </li> |
| </ul> |
| |
| |
| </nav> |
| |
| |
| </div> |
| |
| <div class="book-body"> |
| |
| <div class="body-inner"> |
| |
| |
| |
| <div class="book-header" role="navigation"> |
| |
| |
| <!-- Title --> |
| <h1> |
| <i class="fa fa-circle-o-notch fa-spin"></i> |
| <a href="." >High Availability and Failover</a> |
| </h1> |
| </div> |
| |
| |
| |
| |
| <div class="page-wrapper" tabindex="-1" role="main"> |
| <div class="page-inner"> |
| |
| <div id="book-search-results"> |
| <div class="search-noresults"> |
| |
| <section class="normal markdown-section"> |
| |
| <h1 id="high-availability-and-failover">High Availability and Failover</h1> |
| <p>We define high availability as the <em>ability for the system to continue |
| functioning after failure of one or more of the servers</em>.</p> |
| <p>A part of high availability is <em>failover</em> which we define as the |
| <em>ability for client connections to migrate from one server to another in |
| event of server failure so client applications can continue to operate</em>.</p> |
| <h2 id="live---backup-groups">Live - Backup Groups</h2> |
<p>Apache ActiveMQ Artemis allows servers to be linked together as <em>live - backup</em> groups
where each live server can have one or more backup servers. A backup
server is owned by only one live server. Backup servers are not
operational until failover occurs; however, one chosen backup, which will
be in passive mode, announces its status and waits to take over the live
server's work.</p>
<p>Before failover, only the live server is serving the Apache ActiveMQ Artemis clients
while the backup servers remain passive, waiting to become live. When a
live server crashes or is brought down in the correct
mode, the backup server currently in passive mode will become live and
another backup server will become passive. If a live server restarts
after a failover then it will have priority and be the next server to
become live when the current live server goes down; if the current live
server is configured to allow automatic failback then it will detect the
original live server coming back up and automatically stop.</p>
| <h3 id="ha-policies">HA Policies</h3> |
<p>Apache ActiveMQ Artemis supports two different strategies for backing up a server:
<em>shared store</em> and <em>replication</em>. These are configured via the
<code>ha-policy</code> configuration element.</p>
| <pre><code><ha-policy> |
| <replication/> |
| </ha-policy> |
| </code></pre><p>or</p> |
| <pre><code><ha-policy> |
| <shared-store/> |
| </ha-policy> |
</code></pre><p>As well as these two strategies there is also a third called <code>live-only</code>.
This means there will be no backup strategy; it is the default
if none is provided, and it is used to configure <code>scale-down</code>,
which we will cover later in this chapter.</p>
| <blockquote> |
| <p><strong>Note</strong></p> |
<p>The <code>ha-policy</code> configuration replaces any current HA configuration
in the root of the <code>broker.xml</code> configuration. All old
configuration is now deprecated, although best efforts will be made to
honour it if configured this way.</p>
| <p><strong>Note</strong></p> |
| <p>Only persistent message data will survive failover. Any non persistent |
| message data will not be available after failover.</p> |
| </blockquote> |
<p>The <code>ha-policy</code> type configures which strategy a cluster should use to
provide the backing up of a server's data. This configuration
element also determines how a server should behave within the cluster:
as a master (live), a slave (backup) or colocated (both live and
backup). This would look something like:</p>
| <pre><code><ha-policy> |
| <replication> |
| <master/> |
| </replication> |
| </ha-policy> |
| </code></pre><p>or</p> |
| <pre><code><ha-policy> |
<shared-store>
<slave/>
</shared-store>
| </ha-policy> |
| </code></pre><p>or</p> |
| <pre><code><ha-policy> |
| <replication> |
| <colocated/> |
| </replication> |
| </ha-policy> |
| </code></pre><h3 id="data-replication">Data Replication</h3> |
<p>When using replication, the live and the backup servers do not share the
same data directories; all data synchronization is done over the
network. Therefore all (persistent) data received by the live server
will be duplicated to the backup.</p>
| <p>Notice that upon start-up the backup server will first need to |
| synchronize all existing data from the live server before becoming |
| capable of replacing the live server should it fail. So unlike when |
| using shared storage, a replicating backup will not be a fully |
| operational backup right after start-up, but only after it finishes |
| synchronizing the data with its live server. The time it will take for |
| this to happen will depend on the amount of data to be synchronized and |
| the connection speed.</p> |
| <blockquote> |
| <p><strong>Note</strong></p> |
| <p>In general, synchronization occurs in parallel with current network traffic so |
| this won't cause any blocking on current clients. However, there is a critical |
| moment at the end of this process where the replicating server must complete |
| the synchronization and ensure the replica acknowledges this completion. This |
| exchange between the replicating server and replica will block any journal |
| related operations. The maximum length of time that this exchange will block |
| is controlled by the <code>initial-replication-sync-timeout</code> configuration element.</p> |
| </blockquote> |
<p>Replication will create a copy of the data at the backup. One issue to
be aware of is: in case of a successful fail-over, the backup's data
will be newer than the data in the live server's storage. If you configure your
live server to perform a failback when restarted, it will synchronize its data
with the backup's. If both servers are shutdown, the administrator will
have to determine which one has the latest data.</p>
| <p>The replicating live and backup pair must be part of a cluster. The |
| Cluster Connection also defines how backup servers will find the remote |
| live servers to pair with. Refer to <a href="clusters.html">Clusters</a> for details on how this is done, |
| and how to configure a cluster connection. Notice that:</p> |
| <ul> |
| <li><p>Both live and backup servers must be part of the same cluster. |
| Notice that even a simple live/backup replicating pair will require |
| a cluster configuration.</p> |
| </li> |
| <li><p>Their cluster user and password must match.</p> |
| </li> |
| </ul> |
| <p>Within a cluster, there are two ways that a backup server will locate a |
| live server to replicate from, these are:</p> |
| <ul> |
<li><p><code>specifying a node group</code>. You can specify a group of live servers
that a backup server can connect to. This is done by configuring
<code>group-name</code> in either the <code>master</code> or the <code>slave</code> element of the
<code>broker.xml</code>. A backup server will only connect to a
live server that shares the same node group name (a configuration sketch follows the note below).</p>
| </li> |
<li><p><code>connecting to any live</code>. This will be the behaviour if <code>group-name</code>
is not configured, allowing a backup server to connect to any live
server.</p>
| </li> |
| </ul> |
| <blockquote> |
| <p><strong>Note</strong></p> |
| <p>A <code>group-name</code> example: suppose you have 5 live servers and 6 backup |
| servers:</p> |
| <ul> |
| <li><p><code>live1</code>, <code>live2</code>, <code>live3</code>: with <code>group-name=fish</code></p> |
| </li> |
| <li><p><code>live4</code>, <code>live5</code>: with <code>group-name=bird</code></p> |
| </li> |
| <li><p><code>backup1</code>, <code>backup2</code>, <code>backup3</code>, <code>backup4</code>: with <code>group-name=fish</code></p> |
| </li> |
| <li><p><code>backup5</code>, <code>backup6</code>: with <code>group-name=bird</code></p> |
| </li> |
| </ul> |
| <p>After joining the cluster the backups with <code>group-name=fish</code> will |
| search for live servers with <code>group-name=fish</code> to pair with. Since |
| there is one backup too many, the <code>fish</code> will remain with one spare |
| backup.</p> |
| <p>The 2 backups with <code>group-name=bird</code> (<code>backup5</code> and <code>backup6</code>) will |
| pair with live servers <code>live4</code> and <code>live5</code>.</p> |
| </blockquote> |
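<p>As a minimal configuration sketch (reusing the illustrative group name <code>fish</code>
from the note above), the live server would set the group name in its <code>master</code>
element:</p>
<pre><code><ha-policy>
   <replication>
      <master>
         <group-name>fish</group-name>
      </master>
   </replication>
</ha-policy>
</code></pre><p>and a matching backup would set the same name in its <code>slave</code> element:</p>
<pre><code><ha-policy>
   <replication>
      <slave>
         <group-name>fish</group-name>
      </slave>
   </replication>
</ha-policy>
</code></pre>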
| <p>The backup will search for any live server that it is configured to |
| connect to. It then tries to replicate with each live server in turn |
| until it finds a live server that has no current backup configured. If |
| no live server is available it will wait until the cluster topology |
| changes and repeats the process.</p> |
| <blockquote> |
| <p><strong>Note</strong></p> |
<p>This is an important distinction from a shared-store backup: if a
backup starts and does not find a live server, the server will just
activate and start to serve client requests. In the replication case,
the backup just keeps waiting for a live server to pair with. Note
that in replication the backup server does not know whether any data
it might have is up to date, so it really cannot decide to activate
automatically. To activate a replicating backup server using the data
it has, the administrator must change its configuration to make it a
live server by changing <code>slave</code> to <code>master</code>.</p>
| </blockquote> |
<p>Much like in the shared-store case, when the live server stops or
crashes, its replicating backup will become active and take over its
duties. Specifically, the backup will become active when it loses
connection to its live server. This can be problematic because it can
also happen because of a temporary network problem. In order to address
this issue, the backup will try to determine whether it still can
connect to the other servers in the cluster. If it can connect to more
than half the servers, it will become active; if more than half the
servers also disappeared with the live, the backup will wait and try
reconnecting with the live. This avoids a split-brain situation.</p>
| <h4 id="configuration">Configuration</h4> |
<p>To configure the live and backup servers to be a replicating pair,
configure the live server in <code>broker.xml</code> to have:</p>
| <pre><code><ha-policy> |
| <replication> |
| <master/> |
| </replication> |
| </ha-policy> |
| . |
| <cluster-connections> |
| <cluster-connection name="my-cluster"> |
| ... |
| </cluster-connection> |
| </cluster-connections> |
</code></pre><p>The backup server must be similarly configured but as a <code>slave</code>:</p>
| <pre><code><ha-policy> |
| <replication> |
| <slave/> |
| </replication> |
| </ha-policy> |
| </code></pre><h4 id="all-replication-configuration">All Replication Configuration</h4> |
| <p>The following table lists all the <code>ha-policy</code> configuration elements for |
| HA strategy Replication for <code>master</code>:</p> |
| <table summary="HA Replication Master Policy" border="1"> |
| <colgroup> |
| <col> |
| <col> |
| </colgroup> |
| <thead> |
| <tr> |
| <th>Name</th> |
| <th>Description</th> |
| </tr> |
| </thead> |
| <tbody> |
| <tr> |
| <td>`check-for-live-server`</td> |
| <td>Whether to check the cluster for a (live) server using our own server ID |
| when starting up. This option is only necessary for performing 'fail-back' |
| on replicating servers.</td> |
| </tr> |
| <tr> |
| <td>`cluster-name`</td> |
| <td>Name of the cluster configuration to use for replication. This setting is |
| only necessary if you configure multiple cluster connections. If configured then |
| the connector configuration of the cluster configuration with this name will be |
| used when connecting to the cluster to discover if a live server is already running, |
| see `check-for-live-server`. If unset then the default cluster connections configuration |
| is used (the first one configured).</td> |
| </tr> |
| <tr> |
| <td>`group-name`</td> |
| <td>If set, backup servers will only pair with live servers with matching group-name.</td> |
| </tr> |
| <tr> |
| <td>`initial-replication-sync-timeout`</td> |
| <td>The amount of time the replicating server will wait at the completion of the initial |
| replication process for the replica to acknowledge it has received all the necessary |
| data. The default is 30,000 milliseconds. <strong>Note</strong>: during this interval any |
| journal related operations will be blocked.</td> |
| </tr> |
| </tbody> |
| </table> |
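<p>As an illustrative sketch only (the element names are taken from the table above;
the values are hypothetical), a <code>master</code> combining these elements might look
like:</p>
<pre><code><ha-policy>
   <replication>
      <master>
         <check-for-live-server>true</check-for-live-server>
         <cluster-name>my-cluster</cluster-name>
         <group-name>fish</group-name>
         <initial-replication-sync-timeout>30000</initial-replication-sync-timeout>
      </master>
   </replication>
</ha-policy>
</code></pre>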
| |
| <p>The following table lists all the <code>ha-policy</code> configuration elements for |
| HA strategy Replication for <code>slave</code>:</p> |
| <table summary="HA Replication Slave Policy" border="1"> |
| <colgroup> |
| <col> |
| <col> |
| </colgroup> |
| <thead> |
| <tr> |
| <th>Name</th> |
| <th>Description</th> |
| </tr> |
| </thead> |
| <tbody> |
| <tr> |
| <td>`cluster-name`</td> |
| <td>Name of the cluster configuration to use for replication. |
| This setting is only necessary if you configure multiple cluster |
| connections. If configured then the connector configuration of |
| the cluster configuration with this name will be used when |
| connecting to the cluster to discover if a live server is already |
| running, see `check-for-live-server`. If unset then the default |
| cluster connections configuration is used (the first one configured)</td> |
| </tr> |
| <tr> |
| <td>`group-name`</td> |
| <td>If set, backup servers will only pair with live servers with matching group-name</td> |
| </tr> |
| <tr> |
| <td>`max-saved-replicated-journals-size`</td> |
<td>This specifies how many times a replicated backup server
can restart after moving its files on start. Once there are
this many backup journal files the server will stop permanently
after it fails back.</td>
| </tr> |
| <tr> |
| <td>`allow-failback`</td> |
<td>Whether a server will automatically stop when another server places a
request to take over its place. The use case is when the backup has
failed over.</td>
| </tr> |
| <tr> |
| <td>`initial-replication-sync-timeout`</td> |
| <td>After failover and the slave has become live, this is |
| set on the new live server. It represents the amount of time |
| the replicating server will wait at the completion of the |
| initial replication process for the replica to acknowledge |
| it has received all the necessary data. The default is |
| 30,000 milliseconds. <strong>Note</strong>: during this interval any |
| journal related operations will be blocked.</td> |
| </tr> |
| </tbody> |
| </table> |
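<p>Again as a sketch (the values are hypothetical; the element names come from the
table above), a <code>slave</code> using these elements might be configured as:</p>
<pre><code><ha-policy>
   <replication>
      <slave>
         <cluster-name>my-cluster</cluster-name>
         <group-name>fish</group-name>
         <max-saved-replicated-journals-size>2</max-saved-replicated-journals-size>
         <allow-failback>true</allow-failback>
         <initial-replication-sync-timeout>30000</initial-replication-sync-timeout>
      </slave>
   </replication>
</ha-policy>
</code></pre>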
| |
| <h3 id="shared-store">Shared Store</h3> |
<p>When using a shared store, both live and backup servers share the <em>same</em>
entire data directory using a shared file system. This means the paging
directory, journal directory, large messages and bindings journal.</p>
| <p>When failover occurs and a backup server takes over, it will load the |
| persistent storage from the shared file system and clients can connect |
| to it.</p> |
| <p>This style of high availability differs from data replication in that it |
| requires a shared file system which is accessible by both the live and |
| backup nodes. Typically this will be some kind of high performance |
| Storage Area Network (SAN). We do not recommend you use Network Attached |
| Storage (NAS), e.g. NFS mounts to store any shared journal (NFS is |
| slow).</p> |
<p>The advantage of shared-store high availability is that no replication
occurs between the live and backup nodes; this means it does not suffer
any performance penalties due to the overhead of replication during
normal operation.</p>
<p>The disadvantage of the shared store approach is that it requires a
shared file system, and when the backup server activates it needs to
load the journal from the shared store, which can take some time
depending on the amount of data in the store.</p>
<p>If you require the highest performance during normal operation, have
access to a fast SAN, and can live with a slightly slower failover (depending
on the amount of data), we recommend shared store high availability.</p>
| <p><img src="images/ha-shared-store.png" alt="ActiveMQ Artemis ha-shared-store.png"></p> |
| <h4 id="configuration">Configuration</h4> |
<p>To configure the live and backup servers to share their store, configure
it via the <code>ha-policy</code> configuration in <code>broker.xml</code>:</p>
| <pre><code><ha-policy> |
| <shared-store> |
| <master/> |
| </shared-store> |
| </ha-policy> |
| . |
| <cluster-connections> |
| <cluster-connection name="my-cluster"> |
| ... |
| </cluster-connection> |
| </cluster-connections> |
| </code></pre><p>The backup server must also be configured as a backup.</p> |
| <pre><code><ha-policy> |
| <shared-store> |
| <slave/> |
| </shared-store> |
| </ha-policy> |
</code></pre><p>In order for live - backup groups to operate properly with a shared
store, both servers must have configured the location of the journal
directory to point to the <em>same shared location</em> (as explained in <a href="persistence.html">Configuring the message journal</a>).</p>
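<p>For example, both the <code>master</code> and the <code>slave</code> might point the
standard directory settings (described in <a href="persistence.html">Persistence</a>) at the
same shared mount; the paths below are purely illustrative:</p>
<pre><code><paging-directory>/mnt/shared/broker/paging</paging-directory>
<bindings-directory>/mnt/shared/broker/bindings</bindings-directory>
<journal-directory>/mnt/shared/broker/journal</journal-directory>
<large-messages-directory>/mnt/shared/broker/large-messages</large-messages-directory>
</code></pre>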
<p>Also each node, live and backup, will need to have a cluster connection
defined even if not part of a cluster. The Cluster Connection info
defines how backup servers announce their presence to their live server or
any other nodes in the cluster. Refer to <a href="clusters.html">Clusters</a> for details on how this is
done.</p>
| <h3 id="failing-back-to-live-server">Failing Back to live Server</h3> |
<p>After a live server has failed and a backup has taken over its
duties, you may want to restart the live server and have clients fail
back.</p>
<p>In case of "shared disk", simply restart the original live server and
kill the new live server. You can do this by killing the process itself.
Alternatively you can set <code>allow-failback</code> to <code>true</code> on the slave
config, which will force the backup that has become live to automatically
stop. This configuration would look like:</p>
| <pre><code><ha-policy> |
| <shared-store> |
| <slave> |
| <allow-failback>true</allow-failback> |
| </slave> |
| </shared-store> |
| </ha-policy> |
</code></pre><p>In replication HA mode you need to set an extra property
<code>check-for-live-server</code> to <code>true</code> in the <code>master</code> configuration. If set
to true, during start-up a live server will first search the cluster for
another server using its nodeID. If it finds one, it will contact this
server and try to "fail-back". Since this is a remote replication
scenario, the "starting live" will have to synchronize its data with the
server running with its ID; once they are in sync, it will request the
other server (which it assumes is a backup that has assumed its duties)
to shut down so it can take over. This is necessary because otherwise the
live server has no means of knowing whether there was a fail-over, and
if there was, whether the server that took over its duties is still running.
To enable this option, configure your <code>broker.xml</code>
file as follows:</p>
| <pre><code><ha-policy> |
| <replication> |
| <master> |
| <check-for-live-server>true</check-for-live-server> |
</master>
| </replication> |
| </ha-policy> |
| </code></pre><blockquote> |
| <p><strong>Warning</strong></p> |
<p>Be aware that if you restart a live server after failover has
occurred then <code>check-for-live-server</code> must be set to <code>true</code>. If not, the live server
will restart and serve the same messages that the backup has already
handled, causing duplicates.</p>
| </blockquote> |
<p>It is also possible, in the case of shared store, to cause failover to
occur on normal server shutdown. To enable this, set the following
property to true in the <code>ha-policy</code> configuration on either the <code>master</code>
or <code>slave</code> like so:</p>
| <pre><code><ha-policy> |
| <shared-store> |
| <master> |
| <failover-on-shutdown>true</failover-on-shutdown> |
| </master> |
| </shared-store> |
| </ha-policy> |
</code></pre><p>By default this is set to false. If you have set it to false
but still want to stop the server normally and cause failover, then
you can do this by using the management API as explained at <a href="management.html">Management</a>.</p>
<p>You can also force the running live server to shut down when the old live
server comes back up, allowing the original live server to take over
automatically, by setting the following property in the
<code>broker.xml</code> configuration file:</p>
| <pre><code><ha-policy> |
| <shared-store> |
| <slave> |
| <allow-failback>true</allow-failback> |
| </slave> |
| </shared-store> |
| </ha-policy> |
| </code></pre><h4 id="all-shared-store-configuration">All Shared Store Configuration</h4> |
| <p>The following table lists all the <code>ha-policy</code> configuration elements for |
| HA strategy shared store for <code>master</code>:</p> |
| <table summary="HA Shared Store Master Policy" border="1"> |
| <colgroup> |
| <col> |
| <col> |
| </colgroup> |
| <thead> |
| <tr> |
| <th>Name</th> |
| <th>Description</th> |
| </tr> |
| </thead> |
| <tbody> |
| <tr> |
<td>`failover-on-shutdown`</td>
<td>If set to true then when this server is stopped
normally the backup will become live, assuming failover.
If false then the backup server will remain passive.
Note that if this is false and you want failover to occur,
you can use the management API as explained at <a href="management.html">Management</a>.</td>
| </tr> |
| <tr> |
| <td>`wait-for-activation`</td> |
| <td>If set to true then server startup will wait until it is activated. |
| If set to false then server startup will be done in the background. |
| Default is true.</td> |
| </tr> |
| </tbody> |
| </table> |
| |
| <p>The following table lists all the <code>ha-policy</code> configuration elements for |
| HA strategy Shared Store for <code>slave</code>:</p> |
| <table summary="HA Shared Store Slave Policy" border="1"> |
| <colgroup> |
| <col> |
| <col> |
| </colgroup> |
| <thead> |
| <tr> |
| <th>Name</th> |
| <th>Description</th> |
| </tr> |
| </thead> |
| <tbody> |
| <tr> |
<td>`failover-on-shutdown`</td>
<td>In the case of a backup that has become live: when
set to true then when this server is stopped normally
the backup will become live, assuming failover. If false then
the backup server will remain passive. Note that if this is false
and you want failover to occur, you can use the management
API as explained at <a href="management.html">Management</a>.</td>
| </tr> |
| <tr> |
| <td>`allow-failback`</td> |
<td>Whether a server will automatically stop when another server
places a request to take over its place. The use case is
when the backup has failed over.</td>
| </tr> |
| </tbody> |
| </table> |
| |
| <h4 id="colocated-backup-servers">Colocated Backup Servers</h4> |
<p>It is also possible when running standalone to colocate backup servers
in the same JVM as another live server. Live servers can be configured
to request another live server in the cluster to start a backup server
in the same JVM, either using shared store or replication. The new backup
server will inherit its configuration from the live server creating it,
apart from its name, which will be set to <code>colocated_backup_n</code> where n
is the number of backups the server has created, and any directories and
its Connectors and Acceptors, which are discussed later on in this
chapter. A live server can also be configured to allow requests from
backups and to limit how many backups a live server can start. This way you
can evenly distribute backups around the cluster. This is configured via
the <code>ha-policy</code> element in the <code>broker.xml</code> file like
so:</p>
| <pre><code><ha-policy> |
| <replication> |
| <colocated> |
| <request-backup>true</request-backup> |
| <max-backups>1</max-backups> |
| <backup-request-retries>-1</backup-request-retries> |
| <backup-request-retry-interval>5000</backup-request-retry-interval> |
| <master/> |
| <slave/> |
| </colocated> |
</replication>
| </ha-policy> |
</code></pre><p>The above example is configured to use replication; in this case the
<code>master</code> and <code>slave</code> configurations must match those for normal
replication as in the previous section. <code>shared-store</code> is also supported.</p>
| <p><img src="images/ha-colocated.png" alt="ActiveMQ Artemis ha-colocated.png"></p> |
| <h4 id="configuring-connectors-and-acceptors">Configuring Connectors and Acceptors</h4> |
<p>If the HA Policy is colocated then connectors and acceptors will be
inherited from the live server creating it and offset depending on the
setting of the <code>backup-port-offset</code> configuration element. If this is set to,
say, 100 (which is the default) and a connector is using port 61616 then
this will be set to 61716 for the first server created, 61816 for the
second, and so on.</p>
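<p>A sketch of setting this explicitly inside a colocated policy (the value shown
is the default):</p>
<pre><code><ha-policy>
   <replication>
      <colocated>
         <backup-port-offset>100</backup-port-offset>
         ...
      </colocated>
   </replication>
</ha-policy>
</code></pre>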
| <blockquote> |
| <p><strong>Note</strong></p> |
<p>For INVM connectors and acceptors the id will have
<code>colocated_backup_n</code> appended, where n is the backup server number.</p>
| </blockquote> |
| <h4 id="remote-connectors">Remote Connectors</h4> |
<p>It may be that some of the Connectors configured are for external
servers and hence should be excluded from the offset, for instance a
Connector used by the cluster connection to do quorum voting for a
replicated backup server. These can be omitted from being offset by
adding them to the <code>ha-policy</code> configuration like so:</p>
| <pre><code><ha-policy> |
| <replication> |
| <colocated> |
| <excludes> |
| <connector-ref>remote-connector</connector-ref> |
| </excludes> |
| ......... |
| </ha-policy> |
| </code></pre><h4 id="configuring-directories">Configuring Directories</h4> |
<p>Directories for the Journal, Large messages and Paging will be set
according to what the HA strategy is. If shared store then the requesting
server will notify the target server of which directories to use. If
replication is configured then directories will be inherited from the
creating server but have the new backup's name appended.</p>
| <p>The following table lists all the <code>ha-policy</code> configuration elements for colocated policy:</p> |
| <table summary="HA Replication Colocation Policy" border="1"> |
| <colgroup> |
| <col> |
| <col> |
| </colgroup> |
| <thead> |
| <tr> |
| <th>Name</th> |
| <th>Description</th> |
| </tr> |
| </thead> |
| <tbody> |
| <tr> |
| <td>`request-backup`</td> |
| <td>If true then the server will request a backup on another node</td> |
| </tr> |
| <tr> |
| <td>`backup-request-retries`</td> |
<td>How many times the live server will try to request a backup; -1 means forever.</td>
| </tr> |
| <tr> |
| <td>`backup-request-retry-interval`</td> |
<td>How long to wait between retry attempts when requesting a backup server.</td>
| </tr> |
| <tr> |
| <td>`max-backups`</td> |
| <td>How many backups a live server can create</td> |
| </tr> |
| <tr> |
| <td>`backup-port-offset`</td> |
| <td>The offset to use for the Connectors and Acceptors when creating a new backup server.</td> |
| </tr> |
| </tbody> |
| </table> |
| |
| <h3 id="scaling-down">Scaling Down</h3> |
<p>An alternative to using live/backup groups is to configure scale-down.
When configured for scale-down, a server can copy all its messages and
transaction state to another live server. The advantage of this is that
you don't need full backups to provide some form of HA; however there are
disadvantages with this approach, the first being that it only deals with
a server being stopped and not a server crash. The caveat here is if you
configure a backup to scale down.</p>
<p>Another disadvantage is that it is possible to lose message ordering.
This happens in the following scenario: say you have 2 live servers and
messages are distributed evenly between the servers from a single
producer. If one of the servers scales down then the messages sent back
to the other server will be in the queue after the ones already there,
so server 1 could have messages 1,3,5,7,9 and server 2 would have
2,4,6,8,10; if server 2 scales down the order in server 1 would be
1,3,5,7,9,2,4,6,8,10.</p>
| <p><img src="images/ha-scaledown.png" alt="ActiveMQ Artemis ha-scaledown.png"></p> |
| <p>The configuration for a live server to scale down would be something |
| like:</p> |
| <pre><code><ha-policy> |
| <live-only> |
| <scale-down> |
| <connectors> |
| <connector-ref>server1-connector</connector-ref> |
| </connectors> |
| </scale-down> |
| </live-only> |
| </ha-policy> |
</code></pre><p>In this instance the server is configured to use a specific connector to
scale down. If a connector is not specified then the first INVM
connector is chosen; this is to make scale down from a backup server
easy to configure. It is also possible to use discovery to scale down;
this would look like:</p>
| <pre><code><ha-policy> |
| <live-only> |
| <scale-down> |
| <discovery-group-ref discovery-group-name="my-discovery-group"/> |
| </scale-down> |
| </live-only> |
| </ha-policy> |
| </code></pre><h4 id="scale-down-with-groups">Scale Down with groups</h4> |
| <p>It is also possible to configure servers to only scale down to servers |
| that belong in the same group. This is done by configuring the group |
| like so:</p> |
| <pre><code><ha-policy> |
| <live-only> |
| <scale-down> |
| ... |
| <group-name>my-group</group-name> |
| </scale-down> |
| </live-only> |
| </ha-policy> |
</code></pre><p>In this scenario only servers that belong to the group <code>my-group</code> will
be scaled down to.</p>
| <h4 id="scale-down-and-backups">Scale Down and Backups</h4> |
<p>It is also possible to mix scale-down with HA via backup servers. If a
slave is configured to scale down then after failover has occurred,
instead of starting fully, the backup server will immediately scale down
to another live server. The most appropriate configuration for this is
using the <code>colocated</code> approach. It means as you bring up live servers
they will automatically be backed up, and as live servers are
shut down, their messages are made available on another live server. A
typical configuration would look like:</p>
| <pre><code><ha-policy> |
| <replication> |
| <colocated> |
| <backup-request-retries>44</backup-request-retries> |
| <backup-request-retry-interval>33</backup-request-retry-interval> |
| <max-backups>3</max-backups> |
| <request-backup>false</request-backup> |
| <backup-port-offset>33</backup-port-offset> |
| <master> |
| <group-name>purple</group-name> |
| <check-for-live-server>true</check-for-live-server> |
| <cluster-name>abcdefg</cluster-name> |
| </master> |
| <slave> |
| <group-name>tiddles</group-name> |
| <max-saved-replicated-journals-size>22</max-saved-replicated-journals-size> |
| <cluster-name>33rrrrr</cluster-name> |
| <restart-backup>false</restart-backup> |
| <scale-down> |
| <!--a grouping of servers that can be scaled down to--> |
| <group-name>boo!</group-name> |
| <!--either a discovery group--> |
| <discovery-group-ref discovery-group-name="wahey"/> |
| </scale-down> |
| </slave> |
| </colocated> |
| </replication> |
| </ha-policy> |
| </code></pre><h4 id="scale-down-and-clients">Scale Down and Clients</h4> |
<p>When a server is stopping and preparing to scale down it will send a
message to all its clients informing them which server it is scaling
down to before disconnecting them. At this point the client will
reconnect; however this will only succeed once the server has completed
the scale-down. This is to ensure that any state such as queues or
transactions are there for the client when it reconnects. The normal
reconnect settings apply when the client is reconnecting, so these should
be high enough to deal with the time needed to scale down.</p>
| <h2 id="failover-modes">Failover Modes</h2> |
| <p>Apache ActiveMQ Artemis defines two types of client failover:</p> |
| <ul> |
| <li><p>Automatic client failover</p> |
| </li> |
| <li><p>Application-level client failover</p> |
| </li> |
| </ul> |
| <p>Apache ActiveMQ Artemis also provides 100% transparent automatic reattachment of |
| connections to the same server (e.g. in case of transient network |
| problems). This is similar to failover, except it is reconnecting to the |
same server and is discussed in <a href="client-reconnection.html">Client Reconnection and Session Reattachment</a>.</p>
| <p>During failover, if the client has consumers on any non persistent or |
| temporary queues, those queues will be automatically recreated during |
| failover on the backup node, since the backup node will not have any |
| knowledge of non persistent queues.</p> |
| <h3 id="automatic-client-failover">Automatic Client Failover</h3> |
| <p>Apache ActiveMQ Artemis clients can be configured to receive knowledge of all live and |
| backup servers, so that in event of connection failure at the client - |
| live server connection, the client will detect this and reconnect to the |
| backup server. The backup server will then automatically recreate any |
| sessions and consumers that existed on each connection before failover, |
| thus saving the user from having to hand-code manual reconnection logic.</p> |
<p>An Apache ActiveMQ Artemis client detects connection failure when it has not received
packets from the server within the time given by
<code>client-failure-check-period</code> as explained in section <a href="connection-ttl.html">Detecting Dead Connections</a>. If the client
does not receive data in good time, it will assume the connection has
failed and attempt failover. Also if the socket is closed by the OS,
usually if the server process is killed rather than the machine itself
crashing, then the client will failover straight away.</p>
<p>Apache ActiveMQ Artemis clients can be configured to discover the list of live-backup
server groups in a number of different ways. They can be configured
explicitly, or, probably the most common way of doing this, they can use
<em>server discovery</em> to discover the list automatically.
For full details on how to configure server discovery, please see <a href="clusters.html">Clusters</a>.
Alternatively, the clients can explicitly connect to a specific server
and download the current servers and backups; see <a href="clusters.html">Clusters</a>.</p>
| <p>To enable automatic client failover, the client must be configured to |
| allow non-zero reconnection attempts (as explained in <a href="client-reconnection.html">Client Reconnection and Session Reattachment</a>).</p> |
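<p>As a sketch, a client connection URL enabling HA and unlimited reconnection
attempts might look like the following (the host names here are hypothetical):</p>
<pre><code>(tcp://live-host:61616,tcp://backup-host:61616)?ha=true&reconnectAttempts=-1
</code></pre>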
| <p>By default failover will only occur after at least one connection has |
| been made to the live server. In other words, by default, failover will |
| not occur if the client fails to make an initial connection to the live |
| server - in this case it will simply retry connecting to the live server |
| according to the reconnect-attempts property and fail after this number |
| of attempts.</p> |
| <h4 id="failing-over-on-the-initial-connection">Failing over on the Initial Connection</h4> |
| <p>Since the client does not learn about the full topology until after the |
| first connection is made there is a window where it does not know about |
| the backup. If a failure happens at this point the client can only try |
| reconnecting to the original live server. To configure how many attempts |
| the client will make you can set the URL parameter <code>initialConnectAttempts</code>. |
The default for this is <code>0</code>, that is, try only once. Once the number of
attempts has been made, an exception will be thrown.</p>
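<p>For instance, to retry the initial connection three times before giving up, the
URL parameter could be set as in this sketch (host name hypothetical):</p>
<pre><code>tcp://live-host:61616?initialConnectAttempts=3
</code></pre>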
| <p>For examples of automatic failover with transacted and non-transacted |
| JMS sessions, please see <a href="examples.html">the examples</a> chapter.</p> |
| <h4 id="a-note-on-server-replication">A Note on Server Replication</h4> |
| <p>Apache ActiveMQ Artemis does not replicate full server state between live and backup |
| servers. When the new session is automatically recreated on the backup |
| it won't have any knowledge of messages already sent or acknowledged in |
| that session. Any in-flight sends or acknowledgements at the time of |
| failover might also be lost.</p> |
| <p>By replicating full server state, theoretically we could provide a 100% |
| transparent seamless failover, which would avoid any lost messages or |
| acknowledgements, however this comes at a great cost: replicating the |
| full server state (including the queues, session, etc.). This would |
| require replication of the entire server state machine; every operation |
on the live server would have to be replicated on the replica server(s) in
| the exact same global order to ensure a consistent replica state. This |
| is extremely hard to do in a performant and scalable way, especially |
| when one considers that multiple threads are changing the live server |
| state concurrently.</p> |
| <p>It is possible to provide full state machine replication using |
| techniques such as <em>virtual synchrony</em>, but this does not scale well and |
| effectively serializes all operations to a single thread, dramatically |
| reducing concurrency.</p> |
| <p>Other techniques for multi-threaded active replication exist such as |
| replicating lock states or replicating thread scheduling but this is |
| very hard to achieve at a Java level.</p> |
<p>Consequently it was decided that it was not worth massively reducing
performance and concurrency for the sake of 100% transparent failover.
Even without 100% transparent failover, it is simple to guarantee <em>once
and only once</em> delivery, even in the case of failure, by using a
combination of duplicate detection and retrying of transactions. However
this is not 100% transparent to the client code.</p>
| <h4 id="handling-blocking-calls-during-failover">Handling Blocking Calls During Failover</h4> |
| <p>If the client code is in a blocking call to the server, waiting for a |
| response to continue its execution, when failover occurs, the new |
| session will not have any knowledge of the call that was in progress. |
| This call might otherwise hang for ever, waiting for a response that |
| will never come.</p> |
<p>To prevent this, Apache ActiveMQ Artemis will unblock any blocking calls that were in
progress at the time of failover by making them throw a
<code>javax.jms.JMSException</code> (if using JMS), or an <code>ActiveMQException</code> with
error code <code>ActiveMQException.UNBLOCKED</code>. It is up to the client code to
catch this exception and retry any operations if desired.</p>
<p>If the method being unblocked is a call to commit(), or prepare(), then
the transaction will be automatically rolled back and Apache ActiveMQ Artemis will
throw a <code>javax.jms.TransactionRolledBackException</code> (if using JMS), or an
<code>ActiveMQException</code> with error code
<code>ActiveMQException.TRANSACTION_ROLLED_BACK</code> if using the core API.</p>
| <h4 id="handling-failover-with-transactions">Handling Failover With Transactions</h4> |
| <p>If the session is transactional and messages have already been sent or |
| acknowledged in the current transaction, then the server cannot be sure |
| that messages sent or acknowledgements have not been lost during the |
| failover.</p> |
<p>Consequently the transaction will be marked as rollback-only, and any
subsequent attempt to commit it will throw a
<code>javax.jms.TransactionRolledBackException</code> (if using JMS), or an
<code>ActiveMQException</code> with error code
<code>ActiveMQException.TRANSACTION_ROLLED_BACK</code> if using the core API.</p>
| <blockquote> |
| <p><strong>Warning</strong></p> |
<p>The caveat to this rule is when XA is used either via JMS or through
the core API. If two-phase commit is used and prepare has already been
called then rolling back could cause a <code>HeuristicMixedException</code>.
Because of this the commit will throw an <code>XAException.XA_RETRY</code>
exception. This informs the Transaction Manager that it should retry
the commit at some later point in time; a side effect of this is that
any non persistent messages will be lost. To avoid this use persistent
messages when using XA. With acknowledgements this is not an issue
since they are flushed to the server before prepare gets called.</p>
| </blockquote> |
| <p>It is up to the user to catch the exception, and perform any client side |
| local rollback code as necessary. There is no need to manually rollback |
| the session - it is already rolled back. The user can then just retry |
| the transactional operations again on the same session.</p> |
| <p>Apache ActiveMQ Artemis ships with a fully functioning example demonstrating how to do |
| this, please see <a href="examples.html">the examples</a> chapter.</p> |
| <p>If failover occurs when a commit call is being executed, the server, as |
| previously described, will unblock the call to prevent a hang, since no |
| response will come back. In this case it is not easy for the client to |
| determine whether the transaction commit was actually processed on the |
| live server before failure occurred.</p> |
| <blockquote> |
| <p><strong>Note</strong></p> |
| <p>If XA is being used either via JMS or through the core API then an |
| <code>XAException.XA_RETRY</code> is thrown. This is to inform Transaction |
| Managers that a retry should occur at some point. At some later point |
in time the Transaction Manager will retry the commit. If the original
commit has not occurred then it will still exist and be committed; if
it does not exist then it is assumed to have been committed, although
the transaction manager may log a warning.</p>
| </blockquote> |
| <p>To remedy this, the client can simply enable duplicate detection (<a href="duplicate-detection.html">Duplicate Message Detection</a>) in |
| the transaction, and retry the transaction operations again after the |
| call is unblocked. If the transaction had indeed been committed |
| successfully on the live server before failover, then when the transaction |
| is retried, duplicate detection will ensure that any durable messages |
| resent in the transaction are ignored on the server, preventing them from |
| being sent more than once.</p> |
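| <p>With JMS, for example, each message in the transaction can be given a |
| duplicate-detection ID via the <code>_AMQ_DUPL_ID</code> string property described in |
| the <a href="duplicate-detection.html">Duplicate Message Detection</a> chapter. A sketch, |
| with a hypothetical payload:</p> |
| <pre><code class="lang-java">// Sketch: assign each logical message a fixed duplicate-detection ID. |
| // Generate the ID once per message, not once per send attempt, so that a |
| // retried commit resends the same IDs and the server can drop duplicates. |
| TextMessage message = session.createTextMessage("some payload"); |
| message.setStringProperty("_AMQ_DUPL_ID", java.util.UUID.randomUUID().toString()); |
| producer.send(message); |
| </code></pre> |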
| <blockquote> |
| <p><strong>Note</strong></p> |
| <p>By catching the rollback exceptions and retrying, catching unblocked |
| calls and enabling duplicate detection, once-and-only-once delivery |
| guarantees can be provided for messages in the case of failure, |
| guaranteeing no loss or duplication of messages.</p> |
| </blockquote> |
| <h4 id="handling-failover-with-non-transactional-sessions">Handling Failover With Non Transactional Sessions</h4> |
| <p>If the session is non transactional, messages or acknowledgements can be |
| lost in the event of failover.</p> |
| <p>If you wish to provide <em>once and only once</em> delivery guarantees for non |
| transacted sessions too, enabled duplicate detection, and catch unblock |
| exceptions as described in <a href="ha.html">Handling Blocking Calls During Failover</a></p> |
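| <p>A sketch combining the two techniques for a non-transacted session, again |
| with a hypothetical payload:</p> |
| <pre><code class="lang-java">// Sketch: non-transacted send with duplicate detection plus a retry loop. |
| // The duplicate-detection ID stays fixed across retries of this message. |
| TextMessage message = session.createTextMessage("some payload"); |
| message.setStringProperty("_AMQ_DUPL_ID", java.util.UUID.randomUUID().toString()); |
| boolean sent = false; |
| while (!sent) { |
|    try { |
|       producer.send(message); // may be unblocked by failover |
|       sent = true; |
|    } catch (JMSException e) { |
|       // retry; duplicate detection makes the resend safe |
|    } |
| } |
| </code></pre> |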
| <h3 id="getting-notified-of-connection-failure">Getting Notified of Connection Failure</h3> |
| <p>JMS provides a standard mechanism for getting notified asynchronously of |
| connection failure: <code>javax.jms.ExceptionListener</code>. Please consult the JMS |
| javadoc or any good JMS tutorial for more information on how to use |
| this.</p> |
| <p>The Apache ActiveMQ Artemis core API also provides a similar feature in the form of the |
| class <code>org.apache.activemq.artemis.api.core.client.SessionFailureListener</code>.</p> |
| <p>Any <code>ExceptionListener</code> or <code>SessionFailureListener</code> instance will always be |
| called by ActiveMQ Artemis in the event of connection failure, <strong>irrespective</strong> of |
| whether the connection was successfully failed over, reconnected or |
| reattached. However, you can find out whether reconnect or reattach has |
| happened, either from the <code>failedOver</code> flag passed in to |
| <code>connectionFailed</code> on <code>SessionFailureListener</code>, or by inspecting the |
| error code on the <code>javax.jms.JMSException</code>, which will be one of the |
| following:</p> |
| <p>JMSException error codes:</p> |
| <table summary="JMSException error codes" border="1"> |
| <colgroup> |
| <col> |
| <col> |
| </colgroup> |
| <thead> |
| <tr> |
| <th>Error code</th> |
| <th>Description</th> |
| </tr> |
| </thead> |
| <tbody> |
| <tr> |
| <td>FAILOVER</td> |
| <td>Failover has occurred and we have successfully reattached or reconnected.</td> |
| </tr> |
| <tr> |
| <td>DISCONNECT</td> |
| <td>No failover has occurred and we are disconnected.</td> |
| </tr> |
| </tbody> |
| </table> |
| |
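| <p>As an illustrative sketch, a JMS <code>ExceptionListener</code> could branch on these |
| codes (the literal strings follow the table above; verify against the |
| javadoc for your version):</p> |
| <pre><code class="lang-java">// Sketch: distinguish a successful failover from a plain disconnect. |
| connection.setExceptionListener(new ExceptionListener() { |
|    @Override |
|    public void onException(JMSException e) { |
|       if ("FAILOVER".equals(e.getErrorCode())) { |
|          // failover occurred and we were reattached or reconnected |
|       } else if ("DISCONNECT".equals(e.getErrorCode())) { |
|          // no failover occurred and we are disconnected |
|       } |
|    } |
| }); |
| </code></pre> |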
| <h3 id="application-level-failover">Application-Level Failover</h3> |
| <p>In some cases you may not want automatic client failover, preferring |
| instead to handle any connection failure yourself and code your own manual |
| reconnection logic in your own failure handler. We define this as |
| <em>application-level</em> failover, since the failover is handled at the user |
| application level.</p> |
| <p>To implement application-level failover, if you're using JMS then you |
| need to set an <code>ExceptionListener</code> class on the JMS connection. The |
| <code>ExceptionListener</code> will be called by Apache ActiveMQ Artemis in the event that |
| connection failure is detected. In your <code>ExceptionListener</code>, you would |
| close your old JMS connections, potentially look up new connection |
| factory instances from JNDI and create new connections.</p> |
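| <p>A sketch of such a listener, assuming the factory is bound in JNDI under |
| the hypothetical name <code>"ConnectionFactory"</code>:</p> |
| <pre><code class="lang-java">// Sketch: application-level failover in an ExceptionListener. |
| // 'connection' is assumed to be an effectively final reference to the |
| // current JMS Connection. |
| connection.setExceptionListener(new ExceptionListener() { |
|    @Override |
|    public void onException(JMSException failure) { |
|       try { |
|          connection.close(); // discard the failed connection |
|          Context ctx = new InitialContext(); |
|          ConnectionFactory cf = (ConnectionFactory) ctx.lookup("ConnectionFactory"); |
|          Connection newConnection = cf.createConnection(); |
|          // ... recreate sessions, producers and consumers here ... |
|          newConnection.start(); |
|       } catch (Exception e) { |
|          // application-specific handling, e.g. back off and retry the lookup |
|       } |
|    } |
| }); |
| </code></pre> |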
| <p>For a working example of application-level failover, please see <a href="examples.html">the examples</a> chapter.</p> |
| <p>If you are using the core API, then the procedure is very similar: you |
| would set a <code>FailureListener</code> on the core <code>ClientSession</code> instances.</p> |
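| <p>A minimal sketch using the core API; the listener interface and the |
| <code>addFailureListener</code> method live in <code>org.apache.activemq.artemis.api.core.client</code> |
| (verify the exact signatures against the javadoc for your version):</p> |
| <pre><code class="lang-java">// Sketch: register a SessionFailureListener on a core ClientSession. |
| session.addFailureListener(new SessionFailureListener() { |
|    @Override |
|    public void beforeReconnect(ActiveMQException exception) { |
|       // called before the session is transparently reconnected |
|    } |
|    @Override |
|    public void connectionFailed(ActiveMQException exception, boolean failedOver) { |
|       if (failedOver) { |
|          // failover, reconnect or reattach succeeded |
|       } else { |
|          // the connection is gone and was not failed over |
|       } |
|    } |
|    @Override |
|    public void connectionFailed(ActiveMQException exception, boolean failedOver, |
|                                 String scaleDownTargetNodeID) { |
|       connectionFailed(exception, failedOver); |
|    } |
| }); |
| </code></pre> |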
| |
| |
| </section> |
| |
| </div> |
| <div class="search-results"> |
| <div class="has-results"> |
| |
| <h1 class="search-results-title"><span class='search-results-count'></span> results matching "<span class='search-query'></span>"</h1> |
| <ul class="search-results-list"></ul> |
| |
| </div> |
| <div class="no-results"> |
| |
| <h1 class="search-results-title">No results matching "<span class='search-query'></span>"</h1> |
| |
| </div> |
| </div> |
| </div> |
| |
| </div> |
| </div> |
| |
| </div> |
| |
| |
| |
| <a href="clusters.html" class="navigation navigation-prev " aria-label="Previous page: Clusters"> |
| <i class="fa fa-angle-left"></i> |
| </a> |
| |
| |
| <a href="graceful-shutdown.html" class="navigation navigation-next " aria-label="Next page: Graceful Server Shutdown"> |
| <i class="fa fa-angle-right"></i> |
| </a> |
| |
| |
| |
| </div> |
| |
| <script> |
| var gitbook = gitbook || []; |
| gitbook.push(function() { |
| gitbook.page.hasChanged({"page":{"title":"High Availability and Failover","level":"1.49","depth":1,"next":{"title":"Graceful Server Shutdown","level":"1.50","depth":1,"path":"graceful-shutdown.md","ref":"graceful-shutdown.md","articles":[]},"previous":{"title":"Clusters","level":"1.48","depth":1,"path":"clusters.md","ref":"clusters.md","articles":[]},"dir":"ltr"},"config":{"plugins":[],"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"pluginsConfig":{"highlight":{},"search":{},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"fontsettings":{"theme":"white","family":"sans","size":2},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":false}},"github":"apache/activemq-artemis","theme":"default","githubHost":"https://github.com/","pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"variables":{},"title":"ActiveMQ Artemis Documentation","links":{"home":"http://activemq.apache.org/artemis","issues":"https://issues.apache.org/jira/browse/ARTEMIS","contribute":"http://activemq.apache.org/contributing.html"},"gitbook":"3.x.x","description":"ActiveMQ Artemis User Guide and Reference Documentation"},"file":{"path":"ha.md","mtime":"2017-11-01T05:40:43.522Z","type":"markdown"},"gitbook":{"version":"3.2.3","time":"2017-11-01T05:47:39.664Z"},"basePath":".","book":{"language":""}}); |
| }); |
| </script> |
| </div> |
| |
| |
| <script src="gitbook/gitbook.js"></script> |
| <script src="gitbook/theme.js"></script> |
| |
| |
| <script src="gitbook/gitbook-plugin-search/search-engine.js"></script> |
| |
| |
| |
| <script src="gitbook/gitbook-plugin-search/search.js"></script> |
| |
| |
| |
| <script src="gitbook/gitbook-plugin-lunr/lunr.min.js"></script> |
| |
| |
| |
| <script src="gitbook/gitbook-plugin-lunr/search-lunr.js"></script> |
| |
| |
| |
| <script src="gitbook/gitbook-plugin-sharing/buttons.js"></script> |
| |
| |
| |
| <script src="gitbook/gitbook-plugin-fontsettings/fontsettings.js"></script> |
| |
| |
| |
| </body> |
| </html> |
| |