From a3fcc07c7a52d72cee497f1abc8e4b8bb661557a Mon Sep 17 00:00:00 2001
From: Victor Chembaev <chembervint@gmail.com>
Date: Fri, 24 May 2024 11:15:40 +0300
Subject: [PATCH] Fix octavia-interface timeout

Add a Restart=on-failure policy to the octavia-interface systemd unit.
Add the octavia_interface_wait_timeout variable to control
TimeoutStartSec in the octavia-interface systemd unit.

Change-Id: I9de6c27131ce78e85aac56ea5d91d9740fd58354
Closes-Bug: 2067036
---
 .../templates/octavia-interface.service.j2     |  4 ++++
 doc/source/reference/networking/octavia.rst    | 18 ++++++++++++++++++
 ...via-interface-timeout-5e87ea2501d5ab3c.yaml | 10 ++++++++++
 3 files changed, 32 insertions(+)
 create mode 100644 releasenotes/notes/fix-octavia-interface-timeout-5e87ea2501d5ab3c.yaml

diff --git a/ansible/roles/octavia/templates/octavia-interface.service.j2 b/ansible/roles/octavia/templates/octavia-interface.service.j2
index 7f04d9fb42..532cdc72e5 100644
--- a/ansible/roles/octavia/templates/octavia-interface.service.j2
+++ b/ansible/roles/octavia/templates/octavia-interface.service.j2
@@ -7,6 +7,10 @@ After=docker.service
 Type=oneshot
 User=root
 Group=root
+Restart=on-failure
+{% if octavia_interface_wait_timeout is defined %}
+TimeoutStartSec={{ octavia_interface_wait_timeout }}
+{% endif %}
 RemainAfterExit=true
 ExecStartPre=/sbin/ip link set dev {{ octavia_network_interface }} address {{ port_info.port.mac_address }}
 ExecStart=/sbin/dhclient -v {{ octavia_network_interface }} -cf /etc/dhcp/octavia-dhclient.conf
diff --git a/doc/source/reference/networking/octavia.rst b/doc/source/reference/networking/octavia.rst
index 72d2a04432..53f266065f 100644
--- a/doc/source/reference/networking/octavia.rst
+++ b/doc/source/reference/networking/octavia.rst
@@ -437,6 +437,24 @@ Add ``octavia_network_type`` to ``globals.yml`` and set the value to ``tenant``
 
 Next,follow the deployment instructions as normal.
 
+Failure handling
+----------------
+
+On large deployments, where the neutron-openvswitch-agent sync can take
+more than 5 minutes, the octavia-interface.service systemd unit may fail
+because it times out while waiting for the o-hm0 interface to be
+attached to br-int, or for the Octavia management VxLAN to be configured
+on that host. In this case, add ``octavia_interface_wait_timeout``
+to ``globals.yml`` and set the value to the new timeout in seconds:
+
+.. code-block:: yaml
+
+   octavia_interface_wait_timeout: 1800
+
+On deployments with up to 2500 network ports per network node, the sync
+process can take up to 30 minutes. Choose this value according
+to your deployment size.
+
 OVN provider
 ============
 
diff --git a/releasenotes/notes/fix-octavia-interface-timeout-5e87ea2501d5ab3c.yaml b/releasenotes/notes/fix-octavia-interface-timeout-5e87ea2501d5ab3c.yaml
new file mode 100644
index 0000000000..1046e1ede5
--- /dev/null
+++ b/releasenotes/notes/fix-octavia-interface-timeout-5e87ea2501d5ab3c.yaml
@@ -0,0 +1,10 @@
+---
+fixes:
+  - |
+    Fixes 2067036.
+    Added ``octavia_interface_wait_timeout`` to control the
+    octavia-interface.service timeout, so the unit can wait until
+    the openvswitch agent sync has finished and
+    octavia-lb-net is reachable from the host.
+    Also set the restart policy for this unit to on-failure.
+    `LP#2067036 <https://launchpad.net/bugs/2067036>`__
-- 
GitLab