From 9f14ad651a9e6516d02c90d9eb0ec4b7a4702e7e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rados=C5=82aw=20Piliszek?= <radoslaw.piliszek@gmail.com>
Date: Fri, 3 Jan 2020 11:20:00 +0100
Subject: [PATCH] Fix multiple issues with MariaDB handling

These issues affected deploy (and reconfigure) as well as
upgrade, resulting in WSREP issues, failed deploys, or the
need to recover the cluster.

This patch makes sure k-a does not abruptly terminate
nodes and thereby break the cluster.
This is achieved by a cleaner separation between stages
(bootstrap, restart of current nodes, deploy of new nodes)
and by restarting nodes in 3 phases (to keep the quorum).
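A minimal sketch of this batching, simplified from the handler
changed by this patch (the group name, modulo logic and loop
values are taken from the diff below; the full handler also
checks kolla_action and listens on "restart mariadb"):

    - name: Restart MariaDB on existing cluster members
      include_tasks: restart_services.yml
      when:
        - groups.mariadb_port_alive_True is defined
        - inventory_hostname in groups.mariadb_port_alive_True
        - groups.mariadb_port_alive_True.index(inventory_hostname) % 3 == item
      loop: [0, 1, 2]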

Upgrade actions, which operate on a healthy cluster,
have been moved to the upgrade section.

Service restart was refactored into a separate tasks file
(restart_services.yml).

We no longer rely on the master/slave distinction as
all nodes are masters in Galera.

Closes-bug: #1857908
Closes-bug: #1859145
Change-Id: I83600c69141714fc412df0976f49019a857655f5
---
 ansible/roles/mariadb/handlers/main.yml       | 173 ++++--------------
 ansible/roles/mariadb/tasks/bootstrap.yml     |   6 +-
 .../roles/mariadb/tasks/deploy-containers.yml |   7 +-
 .../roles/mariadb/tasks/lookup_cluster.yml    |  98 +++++-----
 .../roles/mariadb/tasks/recover_cluster.yml   |   4 +-
 ansible/roles/mariadb/tasks/register.yml      |  19 --
 .../roles/mariadb/tasks/restart_services.yml  |  46 +++++
 ansible/roles/mariadb/tasks/upgrade.yml       |  24 +++
 ...tiple-mariadb-issues-d023478df6d76622.yaml |  17 ++
 9 files changed, 183 insertions(+), 211 deletions(-)
 create mode 100644 ansible/roles/mariadb/tasks/restart_services.yml
 create mode 100644 releasenotes/notes/fix-multiple-mariadb-issues-d023478df6d76622.yaml

diff --git a/ansible/roles/mariadb/handlers/main.yml b/ansible/roles/mariadb/handlers/main.yml
index 7ac1f0f6b..4aaee273b 100644
--- a/ansible/roles/mariadb/handlers/main.yml
+++ b/ansible/roles/mariadb/handlers/main.yml
@@ -17,16 +17,10 @@
     restart_policy: no
     volumes: "{{ service.volumes }}"
     dimensions: "{{ service.dimensions }}"
-  when:
-    - bootstrap_host is defined
-    - bootstrap_host == inventory_hostname
   listen: Bootstrap MariaDB cluster
-  notify:
-    - restart mariadb
 
-# TODO(jeffrey4l), remove the task check when the wait_for bug is fixed
-# https://github.com/ansible/ansible-modules-core/issues/2788
-- name: wait first mariadb container
+# NOTE(yoctozepto): We have to loop this to avoid breaking on connection resets
+- name: Wait for first MariaDB service port liveness
   wait_for:
     host: "{{ api_interface_address }}"
     port: "{{ mariadb_port }}"
@@ -37,157 +31,60 @@
   until: check_mariadb_port is success
   retries: 10
   delay: 6
-  when:
-    - bootstrap_host is defined
-    - bootstrap_host == inventory_hostname
   listen: Bootstrap MariaDB cluster
 
-- name: Wait for MariaDB to become operational
+- name: Wait for first MariaDB service to sync WSREP
   become: true
   command: >-
     docker exec {{ mariadb_service.container_name }}
     mysql -uroot -p{{ database_password }}
     --silent --skip-column-names
-    -e 'SHOW STATUS LIKE "wsrep_evs_state"'
+    -e 'SHOW STATUS LIKE "wsrep_local_state_comment"'
   changed_when: false
   register: result
-  until: '"OPERATIONAL" in result.stdout'
+  until: result.stdout == "wsrep_local_state_comment\tSynced"
   retries: 10
   delay: 6
   no_log: true
-  when:
-    - bootstrap_host is defined
-    - bootstrap_host == inventory_hostname
   listen: Bootstrap MariaDB cluster
 
-- name: restart slave mariadb
-  vars:
-    service_name: "mariadb"
-    service: "{{ mariadb_services[service_name] }}"
+- name: Creating haproxy mysql user
   become: true
-  kolla_docker:
-    action: "recreate_or_restart_container"
-    common_options: "{{ docker_common_options }}"
-    name: "{{ service.container_name }}"
-    image: "{{ service.image }}"
-    volumes: "{{ service.volumes }}"
-    dimensions: "{{ service.dimensions }}"
-  when:
-    - kolla_action != "config"
-    - inventory_hostname != master_host
-    - not mariadb_recover | default(false)
-  listen: restart mariadb
-
-# TODO(jeffrey4l), remove the task check when the wait_for bug is fixed
-# https://github.com/ansible/ansible-modules-core/issues/2788
-- name: wait for slave mariadb
-  wait_for:
-    host: "{{ api_interface_address }}"
-    port: "{{ mariadb_port }}"
-    connect_timeout: 1
-    timeout: 60
-    search_regex: "MariaDB"
-  register: check_mariadb_port
-  until: check_mariadb_port is success
-  retries: 10
-  delay: 6
-  when:
-    - kolla_action != "config"
-    - inventory_hostname != master_host
-    - not mariadb_recover | default(false)
-  listen: restart mariadb
-
-- name: run upgrade on slave
-  vars:
-    service_name: "mariadb"
-    service: "{{ mariadb_services[service_name] }}"
-  become: true
-  kolla_docker:
-    action: "start_container"
-    common_options: "{{ docker_common_options }}"
-    detach: False
-    dimensions: "{{ service.dimensions }}"
-    environment:
-      KOLLA_UPGRADE:
-      KOLLA_CONFIG_STRATEGY: "{{ config_strategy }}"
-      DB_HOST: "{{ api_interface_address }}"
-      DB_PORT: "{{ mariadb_port }}"
-      DB_ROOT_PASSWORD: "{{ database_password }}"
-    image: "{{ service.image }}"
-    labels:
-      UPGRADE:
-    name: "upgrade_mariadb"
-    restart_policy: no
-    volumes: "{{ service.volumes }}"
-  no_log: true
-  when:
-    - kolla_action == "upgrade"
-    - inventory_hostname != master_host
-    - not mariadb_recover | default(false)
-  listen: restart mariadb
+  kolla_toolbox:
+    module_name: mysql_user
+    module_args:
+      login_host: "{{ api_interface_address }}"
+      login_port: "{{ mariadb_port }}"
+      login_user: "{{ database_user }}"
+      login_password: "{{ database_password }}"
+      name: "haproxy"
+      password: ""
+      host: "%"
+      priv: "*.*:USAGE"
+  listen: Bootstrap MariaDB cluster
 
-- name: restart master mariadb
-  vars:
-    service_name: "mariadb"
-    service: "{{ mariadb_services[service_name] }}"
-  become: true
-  kolla_docker:
-    action: "recreate_or_restart_container"
-    common_options: "{{ docker_common_options }}"
-    name: "{{ service.container_name }}"
-    image: "{{ service.image }}"
-    volumes: "{{ service.volumes }}"
-    dimensions: "{{ service.dimensions }}"
+- name: Restart MariaDB on existing cluster members
+  include_tasks: 'restart_services.yml'
   when:
+    - groups.mariadb_port_alive_True is defined
+    - inventory_hostname in groups.mariadb_port_alive_True
+    - groups.mariadb_port_alive_True.index(inventory_hostname) % 3 == item
     - kolla_action != "config"
-    - inventory_hostname == master_host
-    - not mariadb_recover | default(false)
   listen: restart mariadb
+  loop:
+    - 0
+    - 1
+    - 2
 
-# TODO(jeffrey4l), remove the task check when the wait_for bug is fixed
-# https://github.com/ansible/ansible-modules-core/issues/2788
-- name: Waiting for master mariadb
-  wait_for:
-    host: "{{ api_interface_address }}"
-    port: "{{ mariadb_port }}"
-    connect_timeout: 1
-    timeout: 60
-    search_regex: "MariaDB"
-  register: check_mariadb_port
-  until: check_mariadb_port is success
-  retries: 10
-  delay: 6
+- name: Start MariaDB on new nodes
+  include_tasks: 'restart_services.yml'
   when:
+    - bootstrap_host is not defined or bootstrap_host != inventory_hostname
+    - groups.mariadb_port_alive_False is defined
+    - inventory_hostname in groups.mariadb_port_alive_False
     - kolla_action != "config"
-    - inventory_hostname == master_host
-    - not mariadb_recover | default(false)
   listen: restart mariadb
 
-- name: run upgrade on master
-  vars:
-    service_name: "mariadb"
-    service: "{{ mariadb_services[service_name] }}"
-  become: true
-  kolla_docker:
-    action: "start_container"
-    common_options: "{{ docker_common_options }}"
-    detach: False
-    dimensions: "{{ service.dimensions }}"
-    environment:
-      KOLLA_UPGRADE:
-      KOLLA_CONFIG_STRATEGY: "{{ config_strategy }}"
-      DB_HOST: "{{ api_interface_address }}"
-      DB_PORT: "{{ mariadb_port }}"
-      DB_ROOT_PASSWORD: "{{ database_password }}"
-    image: "{{ service.image }}"
-    labels:
-      UPGRADE:
-    name: "upgrade_mariadb"
-    restart_policy: no
-    volumes: "{{ service.volumes }}"
-  no_log: true
-  when:
-    - kolla_action == "upgrade"
-    - inventory_hostname == master_host
-    - not mariadb_recover | default(false)
-  listen: restart mariadb
+- name: Ensure MariaDB is running normally on bootstrap host
+  include_tasks: 'restart_services.yml'
+  listen: Bootstrap MariaDB cluster
diff --git a/ansible/roles/mariadb/tasks/bootstrap.yml b/ansible/roles/mariadb/tasks/bootstrap.yml
index 249af27e0..279f3a43d 100644
--- a/ansible/roles/mariadb/tasks/bootstrap.yml
+++ b/ansible/roles/mariadb/tasks/bootstrap.yml
@@ -1,13 +1,9 @@
 ---
-- name: Set a fact about the master host
-  set_fact:
-    master_host: "{{ groups['mariadb'][0] }}"
-
 - include_tasks: lookup_cluster.yml
 
 - include_tasks: bootstrap_cluster.yml
   when:
-    - not has_cluster | bool
+    - not mariadb_cluster_exists
     - inventory_hostname == groups['mariadb'][0]
 
 - include_tasks: recover_cluster.yml
diff --git a/ansible/roles/mariadb/tasks/deploy-containers.yml b/ansible/roles/mariadb/tasks/deploy-containers.yml
index 9be12d61a..8a1a70869 100644
--- a/ansible/roles/mariadb/tasks/deploy-containers.yml
+++ b/ansible/roles/mariadb/tasks/deploy-containers.yml
@@ -1,6 +1,5 @@
 ---
-- name: Set a fact about the master host
-  set_fact:
-    master_host: "{{ groups['mariadb'][0] }}"
-
 - import_tasks: check-containers.yml
+
+# NOTE(yoctozepto): handlers prerequisite
+- import_tasks: lookup_cluster.yml
diff --git a/ansible/roles/mariadb/tasks/lookup_cluster.yml b/ansible/roles/mariadb/tasks/lookup_cluster.yml
index 4f7bedf72..35f0b5b3f 100644
--- a/ansible/roles/mariadb/tasks/lookup_cluster.yml
+++ b/ansible/roles/mariadb/tasks/lookup_cluster.yml
@@ -1,25 +1,5 @@
 ---
-- name: Cleaning up temp file on localhost
-  file:
-    path: /tmp/kolla_mariadb_cluster
-    state: absent
-  delegate_to: localhost
-  changed_when: False
-  check_mode: no
-  run_once: True
-
-# NOTE(mnasiadka): Due to the way that we are setting fact has_cluster - content needs to be ''
-- name: Creating temp file on localhost
-  copy:
-    content: ''
-    dest: /tmp/kolla_mariadb_cluster
-    mode: 0644
-  delegate_to: localhost
-  changed_when: False
-  check_mode: no
-  run_once: True
-
-- name: Creating mariadb volume
+- name: Create MariaDB volume
   become: true
   kolla_docker:
     action: "create_volume"
@@ -27,25 +7,59 @@
     name: "mariadb"
   register: mariadb_volume
 
-- name: Writing hostname of host with existing cluster files to temp file
-  copy:
-    content: "{{ ansible_hostname }}"
-    dest: /tmp/kolla_mariadb_cluster
-    mode: 0644
-  delegate_to: localhost
-  changed_when: False
-  check_mode: no
-  when: mariadb_volume is not changed
-
-- name: Registering host from temp file
+- name: Divide hosts by their MariaDB volume availability
+  group_by:
+    key: mariadb_had_volume_{{ mariadb_volume is not changed }}
+
+- name: Establish whether the cluster already exists
   set_fact:
-    has_cluster: "{{ lookup('file', '/tmp/kolla_mariadb_cluster') | length > 0 }}"
-
-- name: Cleaning up temp file on localhost
-  file:
-    path: /tmp/kolla_mariadb_cluster
-    state: absent
-  delegate_to: localhost
-  changed_when: False
-  check_mode: no
-  run_once: True
+    mariadb_cluster_exists: "{{ groups.mariadb_had_volume_True is defined }}"
+
+- block:
+    - name: Check MariaDB service port liveness
+      wait_for:
+        host: "{{ api_interface_address }}"
+        port: "{{ mariadb_port }}"
+        connect_timeout: 1
+        timeout: 10
+        search_regex: "MariaDB"
+      register: check_mariadb_port_liveness
+      ignore_errors: yes
+
+    - name: Divide hosts by their MariaDB service port liveness
+      group_by:
+        key: mariadb_port_alive_{{ check_mariadb_port_liveness is success }}
+
+    - block:
+        - name: Check MariaDB service WSREP sync status
+          become: true
+          command: >-
+            docker exec {{ mariadb_service.container_name }}
+            mysql -uroot -p{{ database_password }}
+            --silent --skip-column-names
+            -e 'SHOW STATUS LIKE "wsrep_local_state_comment"'
+          changed_when: false
+          register: check_mariadb_sync_status
+          no_log: true
+
+        # NOTE(yoctozepto): this is extracted separately to properly handle
+        # the TAB character, which tends to get mangled by the interaction
+        # between Python/Ansible/Jinja2/YAML; the approach below works reliably
+        - name: Extract MariaDB service WSREP sync status
+          set_fact:
+            mariadb_sync_status: "{{ check_mariadb_sync_status.stdout.split('\t')[1] }}"
+
+        - name: Divide hosts by their MariaDB service WSREP sync status
+          group_by:
+            key: mariadb_sync_status_{{ mariadb_sync_status }}
+
+        - name: Fail when MariaDB service is not synced
+          fail:
+            msg: MariaDB service is not synced. Please wait for WSREP sync before proceeding.
+          when:
+            - groups.mariadb_sync_status_Synced is not defined or
+              inventory_hostname not in groups.mariadb_sync_status_Synced
+      when:
+        - groups.mariadb_port_alive_True is defined
+        - inventory_hostname in groups.mariadb_port_alive_True
+  when: not mariadb_recover | default(False)
diff --git a/ansible/roles/mariadb/tasks/recover_cluster.yml b/ansible/roles/mariadb/tasks/recover_cluster.yml
index 4c097d80f..b662e4a9c 100644
--- a/ansible/roles/mariadb/tasks/recover_cluster.yml
+++ b/ansible/roles/mariadb/tasks/recover_cluster.yml
@@ -1,7 +1,7 @@
 ---
 - fail:
     msg: "MariaDB cluster was not found. Is your inventory correct?"
-  when: not has_cluster | bool
+  when: not mariadb_cluster_exists
 
 - name: Cleaning up temp file on mariadb hosts
   file:
@@ -97,8 +97,6 @@
 
 - set_fact:
     bootstrap_host: "{{ mariadb_recover_inventory_name }}"
-    master_host: "{{ mariadb_recover_inventory_name }}"
-  changed_when: true
 
 - name: Copying grastate.dat file from MariaDB container in bootstrap host
   become: true
diff --git a/ansible/roles/mariadb/tasks/register.yml b/ansible/roles/mariadb/tasks/register.yml
index b4b87d97f..ab60c16cc 100644
--- a/ansible/roles/mariadb/tasks/register.yml
+++ b/ansible/roles/mariadb/tasks/register.yml
@@ -1,19 +1,4 @@
 ---
-- name: Creating haproxy mysql user
-  become: true
-  kolla_toolbox:
-    module_name: mysql_user
-    module_args:
-      login_host: "{{ api_interface_address }}"
-      login_port: "{{ mariadb_port }}"
-      login_user: "{{ database_user }}"
-      login_password: "{{ database_password }}"
-      name: "haproxy"
-      password: ""
-      host: "%"
-      priv: "*.*:USAGE"
-  run_once: True
-
 - import_tasks: wait_for_loadbalancer.yml
 
 - name: Creating the Mariabackup database
@@ -65,7 +50,3 @@
   run_once: True
   when:
     - enable_mariabackup | bool
-
-- name: Cleaning up facts
-  set_fact:
-    delegate_host: "bootstraped"
diff --git a/ansible/roles/mariadb/tasks/restart_services.yml b/ansible/roles/mariadb/tasks/restart_services.yml
new file mode 100644
index 000000000..86768d309
--- /dev/null
+++ b/ansible/roles/mariadb/tasks/restart_services.yml
@@ -0,0 +1,46 @@
+---
+- name: Restart MariaDB container
+  vars:
+    service_name: "mariadb"
+    service: "{{ mariadb_services[service_name] }}"
+  become: true
+  kolla_docker:
+    action: "recreate_or_restart_container"
+    common_options: "{{ docker_common_options }}"
+    name: "{{ service.container_name }}"
+    image: "{{ service.image }}"
+    volumes: "{{ service.volumes }}"
+    dimensions: "{{ service.dimensions }}"
+
+# NOTE(yoctozepto): We have to loop this to avoid breaking on connection resets
+- name: Wait for MariaDB service port liveness
+  wait_for:
+    host: "{{ api_interface_address }}"
+    port: "{{ mariadb_port }}"
+    connect_timeout: 1
+    timeout: 60
+    search_regex: "MariaDB"
+  register: check_mariadb_port
+  until: check_mariadb_port is success
+  retries: 10
+  delay: 6
+
+- name: Wait for MariaDB service to sync WSREP
+  become: true
+  command: >-
+    docker exec {{ mariadb_service.container_name }}
+    mysql -uroot -p{{ database_password }}
+    --silent --skip-column-names
+    -e 'SHOW STATUS LIKE "wsrep_local_state_comment"'
+  changed_when: false
+  register: result
+  until: result.stdout == "wsrep_local_state_comment\tSynced"
+  retries: 10
+  delay: 6
+  no_log: true
+  when:
+    # NOTE(yoctozepto): we don't want to wait for new nodes to fully sync
+    # with an existing cluster as this could take time
+    - not mariadb_cluster_exists or
+      (groups.mariadb_port_alive_True is defined and
+      inventory_hostname in groups.mariadb_port_alive_True)
diff --git a/ansible/roles/mariadb/tasks/upgrade.yml b/ansible/roles/mariadb/tasks/upgrade.yml
index f670a5b78..3d7686eab 100644
--- a/ansible/roles/mariadb/tasks/upgrade.yml
+++ b/ansible/roles/mariadb/tasks/upgrade.yml
@@ -1,2 +1,26 @@
 ---
 - include_tasks: deploy.yml
+
+- name: Run upgrade in MariaDB container
+  vars:
+    service_name: "mariadb"
+    service: "{{ mariadb_services[service_name] }}"
+  become: true
+  kolla_docker:
+    action: "start_container"
+    common_options: "{{ docker_common_options }}"
+    detach: False
+    dimensions: "{{ service.dimensions }}"
+    environment:
+      KOLLA_UPGRADE:
+      KOLLA_CONFIG_STRATEGY: "{{ config_strategy }}"
+      DB_HOST: "{{ api_interface_address }}"
+      DB_PORT: "{{ mariadb_port }}"
+      DB_ROOT_PASSWORD: "{{ database_password }}"
+    image: "{{ service.image }}"
+    labels:
+      UPGRADE:
+    name: "upgrade_mariadb"
+    restart_policy: no
+    volumes: "{{ service.volumes }}"
+  no_log: true
diff --git a/releasenotes/notes/fix-multiple-mariadb-issues-d023478df6d76622.yaml b/releasenotes/notes/fix-multiple-mariadb-issues-d023478df6d76622.yaml
new file mode 100644
index 000000000..564a81c84
--- /dev/null
+++ b/releasenotes/notes/fix-multiple-mariadb-issues-d023478df6d76622.yaml
@@ -0,0 +1,17 @@
+---
+fixes:
+  - |
+    Fixes MariaDB issues in multinode scenarios that affected
+    deployment, reconfiguration, upgrade and Galera cluster resizing.
+    They usually manifested as WSREP issues in various places
+    and could lead to the need to recover the Galera cluster.
+    Note that these issues were due to how MariaDB was handled during
+    Kolla Ansible runs and did not affect the Galera cluster during
+    normal operations unless MariaDB was later touched by Kolla Ansible.
+    Users wishing to run actions on their Galera clusters using
+    Kolla Ansible are strongly advised to update.
+    For details please see the following Launchpad bug records:
+    `bug 1857908
+    <https://bugs.launchpad.net/kolla-ansible/+bug/1857908>`__ and
+    `bug 1859145
+    <https://bugs.launchpad.net/kolla-ansible/+bug/1859145>`__.
-- 
GitLab