From 6cf22b0cb1f2dc4d8910409284fa5757a7dd67a1 Mon Sep 17 00:00:00 2001
From: John Garbutt <john.garbutt@stackhpc.com>
Date: Fri, 17 Dec 2021 17:34:44 +0000
Subject: [PATCH] Improve RabbitMQ performance by reducing ha replicas

Currently we do not follow the RabbitMQ advice on replicas here:
https://www.rabbitmq.com/ha.html#replication-factor

Here we reduce the number of replicas to n // 2 + 1 as advised
above. The hope is that this helps speed up recovery from rabbit
issues.

Related-Bug: #1954925
Change-Id: Ib6bcb26c499c9884faa4a0cd51abaec00cacb096
---
 ansible/roles/rabbitmq/defaults/main.yml       |  8 +++++++-
 ...a-improve-performance-8f29c7657d2999dd.yaml | 18 ++++++++++++++++++
 2 files changed, 25 insertions(+), 1 deletion(-)
 create mode 100644 releasenotes/notes/rabbitmq-ha-improve-performance-8f29c7657d2999dd.yaml

diff --git a/ansible/roles/rabbitmq/defaults/main.yml b/ansible/roles/rabbitmq/defaults/main.yml
index ee983d672..4c059eadb 100644
--- a/ansible/roles/rabbitmq/defaults/main.yml
+++ b/ansible/roles/rabbitmq/defaults/main.yml
@@ -89,10 +89,16 @@ rabbitmq_cluster_partition_handling: "pause_minority"
 # More details see:
 # https://www.rabbitmq.com/ha.html#promoting-unsynchronised-mirrors
 rabbitmq_ha_promote_on_shutdown:
+# The number of rabbitmq replicas should follow this advice:
+# https://www.rabbitmq.com/ha.html#replication-factor
+# This means, if you have three rabbit nodes, we request two
+# replicas of all queues and exchanges.
+# Note: this assumes an odd number of rabbitmq nodes.
 # If no replica count is specified, replicates across all nodes with definition
 # "ha-mode":"all". Otherwise, uses
 # "ha-mode":"exactly","ha-params":{{ rabbitmq_ha_replica_count | int }}
-rabbitmq_ha_replica_count:
+rabbitmq_server_count: "{{ groups[role_rabbitmq_groups] | length }}"
+rabbitmq_ha_replica_count: "{{ (rabbitmq_server_count | int // 2 + 1) }}"
 rabbitmq_extra_config: {}
 
 ####################
diff --git a/releasenotes/notes/rabbitmq-ha-improve-performance-8f29c7657d2999dd.yaml b/releasenotes/notes/rabbitmq-ha-improve-performance-8f29c7657d2999dd.yaml
new file mode 100644
index 000000000..a21f7061b
--- /dev/null
+++ b/releasenotes/notes/rabbitmq-ha-improve-performance-8f29c7657d2999dd.yaml
@@ -0,0 +1,18 @@
+---
+upgrade:
+  - |
+    RabbitMQ replica count has changed from n to (n//2+1) where n is the number
+    of RabbitMQ nodes. That is, for a 3 node cluster, we request exactly 2
+    replicas, for a 1 node cluster, we request 1 replica, and for a 5 node
+    cluster, we request 3 replicas. This only has an effect if
+    `om_enable_rabbitmq_high_availability` is set to `True`, otherwise queues
+    are not replicated. The number of mirrored queues is not changed
+    automatically, and instead requires the queues to be recreated (for
+    example, by restarting RabbitMQ).
+    This follows the good practice advice here:
+    https://www.rabbitmq.com/ha.html#replication-factor
+    A major motivation is to reduce the load on RabbitMQ in larger
+    deployments. It is hoped that the improved performance will also
+    help RabbitMQ recover more quickly from cluster issues.
+    Note that the contents of the RabbitMQ definitions.json are now changed,
+    meaning RabbitMQ containers will be restarted on next deploy/upgrade.
-- 
GitLab