Skip to content

Commit

Permalink
Remove redundant restart (#279)
Browse files Browse the repository at this point in the history
Restarting Elasticsearch takes quite a while and may lead to connection
issues as well as sync issues. So keeping restarts to a minimum is
important. These changes will make sure that, even when the `Restart
Elasticsearch` handler is notified, it will only restart if
Elasticsearch was running before. If there's a fresh start (after
reconfiguration) we don't need to restart again.

Same goes for Logstash and Kibana. Some restarts of these tools happen
fairly fast. But others (like after fresh installs or updates) will
trigger internal jobs that should not be intercepted by another restart.

Beats restart very fast and as far as I know there's not a big downside
to restarting them right after the first start so I didn't include them
in the change.

Additionally, this PR will make sure some tasks in `verify.yml` of the
full stack are only run when the service to be checked is actually
running on this node. This helps with spreading services over nodes to
save resources.

Since GitHub-hosted runners are quite low on resources, we can't run
every service on every node in a cluster setup anymore. So this PR will
make sure that only Elasticsearch runs everywhere and the others are
spread out.

Caches get cleared after every role during a Molecule test. This
helps with saving resources, too.

Elasticsearch still won't sync all shards due to full volumes, so the
disk watermarks for Elasticsearch are set to extremely high values so
that the cluster can at least get into sync.

fixes #278
fixes #141 
fixes #194
  • Loading branch information
widhalmt authored Oct 16, 2023
1 parent 62bf591 commit d5f7f54
Show file tree
Hide file tree
Showing 15 changed files with 183 additions and 61 deletions.
1 change: 1 addition & 0 deletions molecule/elasticsearch_default/converge.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
elasticsearch_disable_systemcallfilterchecks: true
elasticstack_release: "{{ lookup('env', 'ELASTIC_RELEASE') | int}}"
elasticsearch_heap: "1"
elasticstack_no_log: false
tasks:
- name: Include Elastics repos role
ansible.builtin.include_role:
Expand Down
3 changes: 2 additions & 1 deletion molecule/elasticstack_default/converge.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@
vars:
elasticsearch_jna_workaround: true
elasticsearch_disable_systemcallfilterchecks: true
elasticsearch_monitoring_enabled: false
elasticstack_release: "{{ lookup('env', 'ELASTIC_RELEASE') | int}}"
elasticsearch_heap: "1"
elasticsearch_heap: "2"
elasticstack_full_stack: true
elasticstack_no_log: false
logstash_pipeline_unsafe_shutdown: true
Expand Down
2 changes: 0 additions & 2 deletions molecule/elasticstack_default/molecule.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ platforms:
groups:
- beats
- logstash
- kibana
- elasticsearch
image: "geerlingguy/docker-${MOLECULE_DISTRO:-centos7}-ansible:latest"
command: ${MOLECULE_DOCKER_COMMAND:-""}
Expand All @@ -22,7 +21,6 @@ platforms:
- name: "elasticstack${ELASTIC_RELEASE}-cluster2-${MOLECULE_DISTRO}"
groups:
- beats
- logstash
- kibana
- elasticsearch
image: "geerlingguy/docker-${MOLECULE_DISTRO:-centos7}-ansible:latest"
Expand Down
78 changes: 41 additions & 37 deletions molecule/elasticstack_default/verify.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,42 +59,46 @@
msg: "Elasticsearch received {{ logstash_count.stdout }} events so far"
when: "'elasticsearch' in group_names"

- name: fetch kibana.yml
ansible.builtin.command: cat /etc/kibana/kibana.yml
register: kibanayml

- name: Show kibana.yml
ansible.builtin.debug:
var: kibanayml.stdout_lines

- name: Check for Kibana port
ansible.builtin.wait_for:
port: 5601
timeout: 120

- name: Connect to Kibana
ansible.builtin.command:
curl
-s
-u elastic:{{ elastic_pass.stdout }}
http://{{ ansible_hostname }}:5601/api/status
register: curl_out
failed_when:
- "'green' not in curl_out.stdout"
- "'Elasticsearch is available' not in curl_out.stdout"

# The following might be nicer but doesn't work
#- name: Connect to Kibana
# ansible.builtin.uri:
# url: http://ansible-role-kibana_full_stack:5601/api/status
# user: elastic
# password: "{{ elastic_password.stdout }}"
# return_content: yes
# register: kibana_status
# #failed_when: "'"title": "Green"' not in kibana_status.content"
# failed_when: "'Green' not in kibana_status.content"

- name: Health check
- name: Run Kibana checks
when: "'kibana' in group_names"
block:

- name: Fetch kibana.yml
ansible.builtin.command: cat /etc/kibana/kibana.yml
register: kibanayml

- name: Show kibana.yml
ansible.builtin.debug:
var: kibanayml.stdout_lines

- name: Check for Kibana port
ansible.builtin.wait_for:
port: 5601
timeout: 120

- name: Connect to Kibana
ansible.builtin.command:
curl
-s
-u elastic:{{ elastic_pass.stdout }}
http://{{ ansible_hostname }}:5601/api/status
register: curl_out
failed_when:
- "'green' not in curl_out.stdout"
- "'Elasticsearch is available' not in curl_out.stdout"

# The following might be nicer but doesn't work
#- name: Connect to Kibana
# ansible.builtin.uri:
# url: http://ansible-role-kibana_full_stack:5601/api/status
# user: elastic
# password: "{{ elastic_password.stdout }}"
# return_content: yes
# register: kibana_status
# #failed_when: "'"title": "Green"' not in kibana_status.content"
# failed_when: "'Green' not in kibana_status.content"

- name: Elasticsearch health check
ansible.builtin.uri:
url: https://localhost:{{ elasticstack_elasticsearch_http_port }}/_cluster/health
method: GET
Expand All @@ -110,7 +114,7 @@
delay: 10
when: groups['elasticsearch'] | length > 1

- name: Node check
- name: Elasticsearch Node check
ansible.builtin.uri:
url: https://localhost:{{ elasticstack_elasticsearch_http_port }}/_cat/nodes
method: GET
Expand Down
7 changes: 7 additions & 0 deletions roles/beats/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -82,3 +82,10 @@
- name: Import Metricbeat tasks
ansible.builtin.import_tasks: metricbeat.yml
when: beats_metricbeat | bool

# Free up some space to let Elasticsearch allocate replicas in GitHub Actions
- name: Remove cache
ansible.builtin.command: >
rm -rf /var/cache/*
changed_when: false
when: ansible_virtualization_type == "container" or ansible_virtualization_type == "docker"
6 changes: 6 additions & 0 deletions roles/elasticsearch/defaults/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,12 @@ elasticsearch_cert_expiration_buffer: 30
elasticstack_ca_will_expire_soon: false
elasticsearch_cert_will_expire_soon: false

# only used internally
elasticsearch_freshstart:
changed: false
elasticsearch_freshstart_security:
changed: false

# "global" variables for all roles

elasticstack_release: 8
Expand Down
5 changes: 4 additions & 1 deletion roles/elasticsearch/handlers/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@
name: elasticsearch
state: restarted
daemon_reload: yes
when: elasticsearch_enable | bool
when:
- elasticsearch_enable | bool
- not elasticsearch_freshstart.changed | bool
- not elasticsearch_freshstart_security.changed | bool

- name: Restart kibana if available for elasticsearch certificates
ansible.builtin.include_tasks: handlers/restart_kibana.yml
Expand Down
104 changes: 90 additions & 14 deletions roles/elasticsearch/tasks/elasticsearch-security.yml
Original file line number Diff line number Diff line change
Expand Up @@ -352,25 +352,31 @@
name: elasticsearch
state: started
enabled: yes
register: elasticsearch_freshstart_security

- name: Wait for all instances to start
ansible.builtin.include_tasks: wait_for_instance.yml
loop: "{{ groups['elasticsearch'] }}"

- name: Force all notified handlers to run at this point, not waiting for normal sync points
ansible.builtin.meta: flush_handlers
tags:
- certificates
- renew_ca
- renew_es_cert

- name: Wait for all instances to start
ansible.builtin.include_tasks: wait_for_instance.yml
loop: "{{ groups['elasticsearch'] }}"
tags:
- certificates
- renew_ca
- renew_es_cert
- name: Restart if Elasticsearch was already running
when:
- not elasticsearch_freshstart.changed | bool
- not elasticsearch_freshstart_security.changed | bool
block:
- name: Force all notified handlers to run at this point, not waiting for normal sync points
ansible.builtin.meta: flush_handlers
tags:
- certificates
- renew_ca
- renew_es_cert

- name: Wait for all instances to start
ansible.builtin.include_tasks: wait_for_instance.yml
loop: "{{ groups['elasticsearch'] }}"
tags:
- certificates
- renew_ca
- renew_es_cert

- name: Check for passwords being set
ansible.builtin.stat:
Expand All @@ -383,6 +389,25 @@
elasticsearch_http_protocol: "https"
when: elasticsearch_http_security

- name: Check for API with bootstrap password
ansible.builtin.uri:
url: "{{ elasticsearch_http_protocol }}://localhost:{{ elasticstack_elasticsearch_http_port }}"
user: elastic
password: "{{ elasticsearch_bootstrap_pw }}"
validate_certs: false
register: elasticsearch_api_status_bootstrap
changed_when: false
no_log: "{{ elasticstack_no_log }}"
when:
- not elasticsearch_passwords_file.stat.exists | bool
- groups['elasticsearch'] | length > 1
until: elasticsearch_api_status_bootstrap.json.cluster_name is defined
retries: 5
delay: 10

# We need this check twice: once to wait for the API to actually be available, and a second time to
# check the actual return code. This should not cause a huge delay.

- name: Check for cluster status with bootstrap password
ansible.builtin.uri:
url: "{{ elasticsearch_http_protocol }}://localhost:{{ elasticstack_elasticsearch_http_port }}/_cluster/health?pretty"
Expand Down Expand Up @@ -410,6 +435,57 @@
delegate_to: "{{ elasticstack_ca }}"
when: elasticsearch_passwords_file.stat.exists | bool

- name: Check for API availability with elastic password
ansible.builtin.uri:
url: "{{ elasticsearch_http_protocol }}://localhost:{{ elasticstack_elasticsearch_http_port }}"
user: elastic
password: "{{ elasticstack_password.stdout }}"
validate_certs: false
register: elasticsearch_api_status
changed_when: false
no_log: "{{ elasticstack_no_log }}"
when:
- elasticsearch_passwords_file.stat.exists | bool
- groups['elasticsearch'] | length > 1
until: elasticsearch_api_status.json.cluster_name is defined
retries: 20
delay: 10

- name: Work around low ressources on CI/CD nodes
when: ansible_virtualization_type == "container" or ansible_virtualization_type == "docker"
block:
# Free up some space to let Elasticsearch allocate replicas in GitHub Actions
- name: Remove cache
ansible.builtin.command: >
rm -rf /var/cache/*
changed_when: false

- name: Set persistent watermarks to very high values in Docker # noqa: risky-shell-pipe
ansible.builtin.shell: >
if test -n "$(ps -p $$ | grep bash)"; then set -o pipefail; fi;
curl
-k
-X PUT
"{{ elasticsearch_http_protocol }}://elastic:{{ elasticstack_password.stdout }}@localhost:9200/_cluster/settings"
-H 'Content-Type: application/json' -d
'
{
"persistent": {
"cluster.routing.allocation.disk.watermark.low": "97%",
"cluster.routing.allocation.disk.watermark.high": "98%",
"cluster.routing.allocation.disk.watermark.flood_stage": "99%",
"cluster.routing.allocation.disk.watermark.flood_stage.frozen": "99%"
}
}
'
changed_when: false
no_log: "{{ elasticstack_no_log }}"
when:
- elasticstack_password.stdout is defined

# We need this check twice: once to wait for the API to actually be available, and a second time to
# check the actual return code. This should not cause a huge delay.

- name: Check for cluster status with elastic password
ansible.builtin.uri:
url: "{{ elasticsearch_http_protocol }}://localhost:{{ elasticstack_elasticsearch_http_port }}/_cluster/health?pretty"
Expand Down
8 changes: 3 additions & 5 deletions roles/elasticsearch/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -178,9 +178,8 @@
when: ansible_virtualization_type == "container" or ansible_virtualization_type == "docker"

# Free up some space to let Elasticsearch allocate replicas in GitHub Actions
- name: Remove cache # noqa: risky-shell-pipe
ansible.builtin.shell: >
if test -n "$(ps -p $$ | grep bash)"; then set -o pipefail; fi;
- name: Remove cache
ansible.builtin.command: >
rm -rf /var/cache/*
changed_when: false
when: ansible_virtualization_type == "container" or ansible_virtualization_type == "docker"
Expand All @@ -200,6 +199,7 @@
name: elasticsearch
state: started
enabled: yes
register: elasticsearch_freshstart

- name: Handle cluster setup without security
when: not elasticsearch_security | bool
Expand Down Expand Up @@ -237,8 +237,6 @@
group: root
mode: 0644
backup: "{{ elasticsearch_config_backup }}"
notify:
- Restart Elasticsearch
when: elasticsearch_manage_yaml | bool

- name: Show Info about heap
Expand Down
3 changes: 3 additions & 0 deletions roles/kibana/defaults/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ kibana_cert_will_expire_soon: false
kibana_sniff_on_start: false
kibana_sniff_on_connection_fault: false

kibana_freshstart:
changed: false

# "global" variables for all roles
elasticstack_release: 8
elasticstack_full_stack: true
Expand Down
2 changes: 2 additions & 0 deletions roles/kibana/handlers/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,5 @@
ansible.builtin.service:
name: kibana
state: restarted
when:
- not kibana_freshstart.changed | bool
8 changes: 8 additions & 0 deletions roles/kibana/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@
state: started
enabled: yes
when: kibana_enable | bool
register: kibana_freshstart

# The following is useful when running tests or extra tasks that need to
# have Kibana running. Escape it on Rocky8, because it times out with Elastic 8
Expand All @@ -90,3 +91,10 @@
ansible.builtin.wait_for:
host: localhost
port: 5601

# Free up some space to let Elasticsearch allocate replicas in GitHub Actions
- name: Remove cache
ansible.builtin.command: >
rm -rf /var/cache/*
changed_when: false
when: ansible_virtualization_type == "container" or ansible_virtualization_type == "docker"
5 changes: 5 additions & 0 deletions roles/logstash/defaults/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,11 @@ logstash_pipeline_identifier: true
logstash_pipeline_identifier_field_name: "[netways][pipeline]"
logstash_pipeline_identifier_defaults: false

# Only for internal use

logstash_freshstart:
changed: false

elasticstack_ca_dir: /opt/es-ca
elasticstack_initial_passwords: /usr/share/elasticsearch/initial_passwords
elasticstack_ca_pass: PleaseChangeMe
Expand Down
4 changes: 3 additions & 1 deletion roles/logstash/handlers/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@
ansible.builtin.service:
name: logstash
state: restarted
when: logstash_enable | bool
when:
- logstash_enable | bool
- not logstash_freshstart.changed | bool

- name: Restart Logstash noauto
ansible.builtin.service:
Expand Down
8 changes: 8 additions & 0 deletions roles/logstash/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -230,3 +230,11 @@
state: started
enabled: yes
when: logstash_enable | bool
register: logstash_freshstart

# Free up some space to let Elasticsearch allocate replicas in GitHub Actions
- name: Remove cache
ansible.builtin.command: >
rm -rf /var/cache/*
changed_when: false
when: ansible_virtualization_type == "container" or ansible_virtualization_type == "docker"

0 comments on commit d5f7f54

Please sign in to comment.