# Heat (HOT) template describing the Pacemaker service configured with Puppet.
#
# Layout:
#   parameters       - inputs consumed below through get_param
#   parameter_groups - marks deprecated parameters
#   conditions       - named booleans used by the `if` functions in outputs
#   outputs          - `role_data`: hiera settings, puppet step config and the
#                      Ansible task lists (host_prep / scale / upgrade /
#                      external_upgrade / update) for this service
heat_template_version: wallaby

description: >
  Pacemaker service configured with Puppet

parameters:
  ServiceData:
    default: {}
    description: Dictionary packing service data
    type: json
  ServiceNetMap:
    default: {}
    description: >-
      Mapping of service_name -> network name. Typically set
      via parameter_defaults in the resource registry. Use
      parameter_merge_strategies to merge it with the defaults.
    type: json
  RoleName:
    default: ''
    description: Role name on which the service is applied
    type: string
  RoleParameters:
    default: {}
    description: Parameters specific to the role
    type: json
  EndpointMap:
    default: {}
    description: >-
      Mapping of service endpoint -> protocol. Typically set
      via parameter_defaults in the resource registry.
    type: json
  MonitoringSubscriptionPacemaker:
    default: 'overcloud-pacemaker'
    type: string
  # Listed under the "deprecated" parameter group below.
  CorosyncIPv6:
    default: false
    description: Enable IPv6 in Corosync
    type: boolean
  EnableFencing:
    default: false
    description: Whether to enable fencing in Pacemaker or not.
    type: boolean
  PacemakerTLSPriorities:
    type: string
    description: Pacemaker TLS Priorities
    default: ''
  PacemakerRemoteAuthkey:
    type: string
    description: The authkey for the pacemaker remote service.
    hidden: true
  PcsdPassword:
    type: string
    description: The password for the 'pcsd' user for pacemaker.
    hidden: true
  CorosyncTokenTimeout:
    type: number
    description: >-
      Time in milliseconds until a token loss is declared after
      not receiving a token.
    default: 10000
  CorosyncSettleTries:
    type: number
    description: >-
      Number of tries for cluster settling. This has the
      same default as the pacemaker puppet module. Override
      to a smaller value when in need to replace a controller node.
    default: 360
  FencingConfig:
    default: {}
    description: |
      Pacemaker fencing configuration.
      The JSON should have the following structure:
        {
          "devices": [
            {
              "agent": "AGENT_NAME",
              "host_mac": "HOST_MAC_ADDRESS",
              "params": {"PARAM_NAME": "PARAM_VALUE"}
            }
          ]
        }
      For instance:
        {
          "devices": [
            {
              "agent": "fence_xvm",
              "host_mac": "52:54:00:aa:bb:cc",
              "params": {
                "multicast_address": "225.0.0.12",
                "port": "baremetal_0",
                "manage_fw": true,
                "manage_key_file": true,
                "key_file": "/etc/fence_xvm.key",
                "key_file_password": "abcdef"
              }
            }
          ]
        }
    type: json
  PacemakerLoggingSource:
    type: json
    default:
      tag: system.pacemaker
      file: /var/log/host/pacemaker/pacemaker.log
      startmsg.regex: "^[a-zA-Z]{3} [0-9]{2} [:0-9]{8}"
  ContainerCli:
    type: string
    default: 'podman'
    description: CLI tool used to manage containers.
    constraints:
      - allowed_values: ['docker', 'podman']
  EnableInstanceHA:
    default: false
    description: >-
      Whether to enable an Instance Ha configuration or not.
      This setup requires the Compute role to have the
      PacemakerRemote service added to it.
    type: boolean
  PacemakerBundleOperationTimeout:
    type: string
    default: ''
    description: >-
      The timeout for start, monitor and stop operations
      run by the container resource agent, in seconds.
      When set to default '', the timeout comes from pacemaker's
      default operation timeouts (20s). When set to default and
      podman is used, force the timeout to 120s.
    constraints:
      - allowed_pattern: "([1-9][0-9]*s)?"

parameter_groups:
  - label: deprecated
    description: |
      The following parameters are deprecated and will be removed. They should not
      be relied on for new deployments. If you have concerns regarding deprecated
      parameters, please contact the TripleO development team on IRC or the
      OpenStack mailing list.
    parameters:
      - CorosyncIPv6

conditions:
  pcmk_tls_priorities_empty: {equals: [{get_param: PacemakerTLSPriorities}, '']}
  pcmk_bundle_op_timeout_empty: {equals: [{get_param: PacemakerBundleOperationTimeout}, '']}
  podman_enabled: {equals: [{get_param: ContainerCli}, 'podman']}
  # True when the IP version recorded in ServiceData for the network that
  # ServiceNetMap assigns to PacemakerNetwork equals 6.
  is_ipv6:
    equals:
      - {get_param: [ServiceData, net_ip_version_map, {get_param: [ServiceNetMap, PacemakerNetwork]}]}
      - 6

outputs:
  role_data:
    description: Role data for the Pacemaker role.
    value:
      service_name: pacemaker
      monitoring_subscription: {get_param: MonitoringSubscriptionPacemaker}
      firewall_rules:
        '130 pacemaker tcp':
          proto: 'tcp'
          dport:
            - 2224
            - 3121
            - 21064
        '131 pacemaker udp':
          proto: 'udp'
          dport: 5405
      config_settings:
        # Three maps are merged: the unconditional settings, the optional TLS
        # priorities, and the optional bundle operation timeout.
        map_merge:
          - pacemaker::corosync::cluster_name: 'tripleo_cluster'
            pacemaker::corosync::manage_fw: false
            pacemaker::resource_defaults::defaults:
              resource-stickiness: {value: INFINITY}
            corosync_token_timeout: {get_param: CorosyncTokenTimeout}
            pacemaker::corosync::settle_tries: {get_param: CorosyncSettleTries}
            pacemaker::resource::bundle::deep_compare: true
            pacemaker::resource::ip::deep_compare: true
            pacemaker::resource::ocf::deep_compare: true
            # Derived from the is_ipv6 condition, not from the CorosyncIPv6
            # parameter.
            corosync_ipv6: {if: [is_ipv6, true, false]}
            tripleo::fencing::config: {get_param: FencingConfig}
            tripleo::fencing::deep_compare: true
            enable_fencing: {get_param: EnableFencing}
            hacluster_pwd: {get_param: PcsdPassword}
            tripleo::profile::base::pacemaker::remote_authkey: {get_param: PacemakerRemoteAuthkey}
            tripleo::profile::base::pacemaker::pcsd_bind_addr:
              str_replace:
                template: "%{hiera('$NETWORK')}"
                params:
                  $NETWORK: {get_param: [ServiceNetMap, PacemakerNetwork]}
          # Only emit tls_priorities when the parameter is non-empty.
          - if:
              - pcmk_tls_priorities_empty
              - {}
              - tripleo::pacemaker::tls_priorities: {get_param: PacemakerTLSPriorities}
          # No timeout override when the parameter is empty and podman is not
          # in use. Otherwise set the bundle timeout: '120s' when the
          # parameter is empty (which, in this branch, implies podman), else
          # the parameter value.
          - if:
              - and:
                  - pcmk_bundle_op_timeout_empty
                  - not: podman_enabled
              - {}
              - tripleo::profile::base::pacemaker::resource_op_defaults:
                  bundle:
                    name: timeout
                    value:
                      if:
                        - pcmk_bundle_op_timeout_empty
                        - '120s'
                        - {get_param: PacemakerBundleOperationTimeout}
      service_config_settings:
        rsyslog:
          tripleo_logging_sources_pacemaker:
            - {get_param: PacemakerLoggingSource}
      step_config: |
        include tripleo::profile::base::pacemaker
      host_prep_tasks:
        # Need this until https://bugzilla.redhat.com/show_bug.cgi?id=1857247 is fixed
        - name: Make sure python3-novaclient is installed when IHA is enabled
          package:
            name: python3-novaclient
            state: present
          when: {get_param: EnableInstanceHA}
        - name: Remove existing entries from logind conf
          ansible.builtin.lineinfile:
            path: /etc/systemd/logind.conf
            regexp: '^\s*#?\s*HandlePowerKey\s*=.*'
            state: absent
        - name: Make sure systemd-logind ignores power off
          ansible.builtin.lineinfile:
            path: /etc/systemd/logind.conf
            regexp: '^#?HandlePowerKey'
            line: HandlePowerKey=ignore
        - name: Restart systemd-logind
          ansible.builtin.service:
            name: systemd-logind
            state: restarted
        # `when` takes raw Jinja expressions, so the conditions below are
        # written without "{{ }}" delimiters.
        - name: Gather service_facts on pacemaker_bootstrap_node
          ansible.builtin.service_facts:
          when:
            - "(pacemaker_short_bootstrap_node_name|lower == ansible_facts['hostname']|lower)|bool"
            - {get_param: EnableInstanceHA}
        # The script counts the location constraints attached to
        # stonith-fence_compute-fence-nova and, when there is more than one,
        # deletes every constraint whose id does not match the id built from
        # the current pacemaker_remote_short_node_names hiera value.
        # COUNT is compared numerically (-gt); `>` inside [[ ]] compares
        # strings lexicographically.
        - name: Check and eventually delete duplicate constraints (bootstrap node)
          become: true
          shell: |
            COUNT=$(cibadmin --query | xmllint --xpath '//rsc_location[@rsc="stonith-fence_compute-fence-nova"]/@id' - |grep -oP '(?<=["])[^"]*' -c)
            if [[ "$COUNT" -gt 1 ]]; then
              echo "Detected $COUNT duplicate constraints, deleting them"
              # assemble string with current list of computes
              GOOD=$(echo location-stonith-fence_compute-fence-nova-$(hiera pacemaker_remote_short_node_names |grep -oP '".*?"' |tr -d '"'|tr -d '\n' )--10000)
              # delete old constraints
              for i in $(cibadmin --query | xmllint --xpath '//rsc_location[@rsc="stonith-fence_compute-fence-nova"]/@id' - |grep -oP '(?<=["])[^"]*' |grep -v "$GOOD" ); do
                pcs constraint delete "$i"
              done
            else
              echo "No duplicate constraint found"
            fi
          when:
            - "(pacemaker_short_bootstrap_node_name|lower == ansible_facts['hostname']|lower)|bool"
            - {get_param: EnableInstanceHA}
            - ansible_facts['services']['pacemaker.service']['state'] == 'running'
      scale_tasks:
        - when:
            - step|int == 1
            - container_cli == 'podman'
            - '"pacemaker_remote" in enabled_services|list'
            - inventory_hostname_short in compute_instanceha_short_node_names
          tags: down
          become: true
          block:
            - name: Getting Nova compute hostname
              command: crm_node -n
              register: nova_compute_hostname
            # Reports the problem but does not stop the play (ignore_errors);
            # the cleanup block below is skipped in that case by its own
            # `is defined` condition.
            - name: Check if pacemaker_short_bootstrap_node_name is a defined variable
              when: pacemaker_short_bootstrap_node_name is not defined
              fail:
                msg: 'Cannot delegate pacemaker cleanup to the bootstrap node. Please delete the resources manually.'
              ignore_errors: true
            # Every task in this block is delegated to the bootstrap node.
            - name: Clean up Pacemaker remote and STONITH resources for Compute node
              when:
                - nova_compute_hostname.stdout in compute_instanceha_short_node_names
                - pacemaker_short_bootstrap_node_name is defined
              block:
                - name: List STONITH resource for the Compute node
                  command: stonith_admin -l {{ nova_compute_hostname.stdout }}
                  register: stonith_service_results
                  delegate_to: "{{ pacemaker_short_bootstrap_node_name }}"
                # The loop regex keeps lines starting with "stonith-fence"
                # except those starting with
                # "stonith-fence_compute-fence-nova" (negative lookahead).
                - name: Disable the STONITH resources for the Compute node
                  command: pcs stonith disable "{{ item }}"
                  loop: "{{ stonith_service_results.stdout | regex_findall('^(stonith-fence(?!_compute-fence-nova).*)', multiline=True)}}"
                  delegate_to: "{{ pacemaker_short_bootstrap_node_name }}"
                  register: disable_output
                  failed_when: disable_output.rc != 0
                  retries: 3
                  delay: 5
                  until: disable_output.rc == 0
                - name: Delete the STONITH resources for the Compute node
                  command: pcs stonith delete "{{ item }}"
                  loop: "{{ stonith_service_results.stdout | regex_findall('^(stonith-fence(?!_compute-fence-nova).*)', multiline=True)}}"
                  delegate_to: "{{ pacemaker_short_bootstrap_node_name }}"
                  register: delete_output
                  failed_when: delete_output.rc != 0
                  retries: 3
                  delay: 5
                  until: delete_output.rc == 0
                - name: Clear the stonith level hierarchy for the Compute node target
                  command: pcs stonith level clear target {{ nova_compute_hostname.stdout }}
                  delegate_to: "{{ pacemaker_short_bootstrap_node_name }}"
                - name: Delete Compute node from cluster
                  command: pcs cluster node remove-remote {{ nova_compute_hostname.stdout }}
                  delegate_to: "{{ pacemaker_short_bootstrap_node_name }}"
                # Rewrites pcmk_host_list as the comma-joined remote node
                # names with entries matching this compute hostname rejected.
                - name: Remove compute node in pcmk_host_list parameter from stonith-fence_compute-fence-nova
                  command: >-
                    crm_resource --set-parameter=pcmk_host_list -r stonith-fence_compute-fence-nova -v
                    "{{ pacemaker_remote_short_node_names | reject('search', nova_compute_hostname.stdout) | join(',') }}"
                  delegate_to: "{{ pacemaker_short_bootstrap_node_name }}"
      upgrade_tasks:
        # Since Wallaby, Redis is not deployed by default anymore
        - name: Ensure redis is removed
          when:
            - step|int == 5
            - '"redis" not in enabled_services|list'
            - "(pacemaker_short_bootstrap_node_name|lower == ansible_facts['hostname']|lower)|bool"
          become: true
          shell: |
            if crm_resource -r redis-bundle -q &>/dev/null; then
              pcs resource delete redis-bundle || true
              pcs resource delete ip-$(hiera redis_vip) || true
            fi
        - name: Clean up cluster node cache
          when:
            - step|int == 5
            - '"redis" not in enabled_services|list'
            - "(pacemaker_short_bootstrap_node_name|lower == ansible_facts['hostname']|lower)|bool"
          become: true
          shell: |
            pcs cluster node clear redis-bundle-0
            pcs cluster node clear redis-bundle-1
            pcs cluster node clear redis-bundle-2
            crm_attribute --name redis_REPL_INFO --delete
        - name: Clean up redis attribute
          when:
            - step|int == 5
            - '"redis" not in enabled_services|list'
            - "(pacemaker_short_bootstrap_node_name|lower == ansible_facts['hostname']|lower)|bool"
          become: true
          shell: |
            pcs node attribute "{{ item }}" redis-role= || true
          loop: "{{ pacemaker_short_node_names }}"
      external_upgrade_tasks:
        - when:
            - step|int == 1
          tags:
            - never
            - system_upgrade_stop_services
            - system_upgrade_transfer_data
          block:
            # Runs once per host in the 'pacemaker' group minus
            # 'excluded_overcloud'; pcs is only invoked where the binary
            # exists.
            - name: Stop cluster
              become: true
              shell: |
                set -eu
                FILE=/usr/sbin/pcs
                if test -f "$FILE"; then
                  /usr/sbin/pcs cluster stop --force
                fi
              delegate_to: "{{ item }}"
              with_items: "{{ groups['pacemaker'] | difference(groups['excluded_overcloud']) }}"
      update_tasks:
        - name: Check pacemaker cluster running before the minor update
          when: step|int == 0  # TODO(marios) disabling validations?
          # NOTE: We are intentionally not using the community version of
          # pacemaker_cluster here due to variances between the two:
          # https://bugs.launchpad.net/tripleo/+bug/1938967
          pacemaker_cluster:
            state: online
            check_and_fail: true
          async: 30
          poll: 4
        - name: Acquire the cluster shutdown lock to stop pacemaker cluster
          when: step|int == 1
          command: systemd-cat -t ha-shutdown /var/lib/container-config-scripts/pacemaker_mutex_shutdown.sh --acquire
        - name: Stop pacemaker cluster
          when: step|int == 1
          pacemaker_cluster:
            state: offline
        - name: Start pacemaker cluster
          when: step|int == 4
          pacemaker_cluster:
            state: online
        - name: Release the cluster shutdown lock
          when: step|int == 4
          command: systemd-cat -t ha-shutdown /var/lib/container-config-scripts/pacemaker_mutex_shutdown.sh --release