7.2.10.1.2. Configuration / useful files
7.2.10.1.2.1. Configuration file
prometheus.yml
# my global config
global:
  scrape_interval: {{ prometheus.server.scrape_interval | default(15) }}s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: {{ prometheus.server.evaluation_interval | default(15) }}s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
      # - alertmanager:9093
{% for host in groups['hosts_alertmanager'] %}
        - targets:
            - {{ hostvars[host]['ip_admin'] }}:{{ prometheus.alertmanager.api_port | default(9093) }}
          labels:
            hostname: "{{ host.split(".")[0] }}"
{% endfor %}

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  # - rule.yml
{% for item in rules_files.files %}
  - {{ item.path }}
{% endfor %}

scrape_configs:
{% if prometheus.node_exporter.enabled | default(true) | bool == true %}
  - job_name: vitam-node-exporter
    metrics_path: {{ prometheus.node_exporter.metrics_path | default('/metrics') }}
    static_configs:
{% for host in groups['vitam'] %}
      - targets:
          - {{ hostvars[host]['ip_admin'] }}:{{ prometheus.node_exporter.port | default(9101) }}
        labels:
          hostname: "{{ host.split(".")[0] }}"
{% if host in groups['hosts_elasticsearch_data'] %}
          elastic_cluster_name: "{{ elasticsearch.data.cluster_name }}"
{% elif host in groups['hosts_elasticsearch_log'] %}
          elastic_cluster_name: "{{ elasticsearch.log.cluster_name }}"
{% endif %}
{% endfor %}
{% endif %}
{% if prometheus.consul_exporter.enabled | default(true) | bool == true %}
  - job_name: vitam-consul-exporter
    metrics_path: {{ prometheus.consul_exporter.metrics_path | default('/metrics') }}
    static_configs:
{% for host in groups['vitam'] %}
      - targets:
          - {{ hostvars[host]['ip_admin'] }}:{{ prometheus.consul_exporter.port | default(9107) }}
        labels:
          hostname: "{{ host.split(".")[0] }}"
{% endfor %}
{% endif %}
{% if prometheus.elasticsearch_exporter.enabled | default(true) | bool == true %}
  - job_name: vitam-elasticsearch-exporter
    metrics_path: {{ prometheus.elasticsearch_exporter.metrics_path | default('/metrics') }}
    static_configs:
{% for host in groups['elasticsearch'] %}
      - targets:
          - {{ hostvars[host]['ip_admin'] }}:{{ prometheus.elasticsearch_exporter.port | default(9114) }}
        labels:
          hostname: "{{ host.split(".")[0] }}"
{% endfor %}
{% endif %}
{% if (groups['hosts_access_internal']|length >0) %}
  - job_name: vitam-access-internal
    metrics_path: {{ prometheus.metrics_path | default('/admin/v1/metrics') }}
    static_configs:
{% for host in groups['hosts_access_internal'] %}
      - targets:
          - {{ hostvars[host]['ip_admin'] }}:{{ vitam.accessinternal.port_admin | default(28101) }}
        labels:
          hostname: "{{ host.split(".")[0] }}"
{% endfor %}
{% endif %}
{% if (groups['hosts_access_external']|length >0) %}
  - job_name: vitam-access-external
    metrics_path: {{ prometheus.metrics_path | default('/admin/v1/metrics') }}
    static_configs:
{% for host in groups['hosts_access_external'] %}
      - targets:
          - {{ hostvars[host]['ip_admin'] }}:{{ vitam.accessexternal.port_admin | default(28102) }}
        labels:
          hostname: "{{ host.split(".")[0] }}"
{% endfor %}
{% endif %}
{% if (groups['hosts_ingest_internal']|length >0) %}
  - job_name: vitam-ingest-internal
    metrics_path: {{ prometheus.metrics_path | default('/admin/v1/metrics') }}
    static_configs:
{% for host in groups['hosts_ingest_internal'] %}
      - targets:
          - {{ hostvars[host]['ip_admin'] }}:{{ vitam.ingestinternal.port_admin | default(28100) }}
        labels:
          hostname: "{{ host.split(".")[0] }}"
{% endfor %}
{% endif %}
{% if (groups['hosts_ingest_external']|length >0) %}
  - job_name: vitam-ingest-external
    metrics_path: {{ prometheus.metrics_path | default('/admin/v1/metrics') }}
    static_configs:
{% for host in groups['hosts_ingest_external'] %}
      - targets:
          - {{ hostvars[host]['ip_admin'] }}:{{ vitam.ingestexternal.port_admin | default(28001) }}
        labels:
          hostname: "{{ host.split(".")[0] }}"
{% endfor %}
{% endif %}
{% if (groups['hosts_metadata']|length >0) %}
  - job_name: vitam-metadata
    metrics_path: {{ prometheus.metrics_path | default('/admin/v1/metrics') }}
    static_configs:
{% for host in groups['hosts_metadata'] %}
      - targets:
          - {{ hostvars[host]['ip_admin'] }}:{{ vitam.metadata.port_admin | default(28200) }}
        labels:
          hostname: "{{ host.split(".")[0] }}"
{% endfor %}
{% endif %}
{% if (groups['hosts_ihm_demo']|length >0) %}
  - job_name: vitam-ihm-demo
    metrics_path: {{ prometheus.metrics_path | default('/admin/v1/metrics') }}
    static_configs:
{% for host in groups['hosts_ihm_demo'] %}
      - targets:
          - {{ hostvars[host]['ip_admin'] }}:{{ vitam.ihm_demo.port_admin | default(28002) }}
        labels:
          hostname: "{{ host.split(".")[0] }}"
{% endfor %}
{% endif %}
{% if (groups['hosts_ihm_recette']|length >0) %}
  - job_name: vitam-ihm-recette
    metrics_path: {{ prometheus.metrics_path | default('/admin/v1/metrics') }}
    static_configs:
{% for host in groups['hosts_ihm_recette'] %}
      - targets:
          - {{ hostvars[host]['ip_admin'] }}:{{ vitam.ihm_recette.port_admin | default(28204) }}
        labels:
          hostname: "{{ host.split(".")[0] }}"
{% endfor %}
{% endif %}
{% if (groups['hosts_logbook']|length >0) %}
  - job_name: vitam-logbook
    metrics_path: {{ prometheus.metrics_path | default('/admin/v1/metrics') }}
    static_configs:
{% for host in groups['hosts_logbook'] %}
      - targets:
          - {{ hostvars[host]['ip_admin'] }}:{{ vitam.logbook.port_admin | default(29002) }}
        labels:
          hostname: "{{ host.split(".")[0] }}"
{% endfor %}
{% endif %}
{% if (groups['hosts_workspace']|length >0) %}
  - job_name: vitam-workspace
    metrics_path: {{ prometheus.metrics_path | default('/admin/v1/metrics') }}
    static_configs:
{% for host in groups['hosts_workspace'] %}
      - targets:
          - {{ hostvars[host]['ip_admin'] }}:{{ vitam.workspace.port_admin | default(28201) }}
        labels:
          hostname: "{{ host.split(".")[0] }}"
{% endfor %}
{% endif %}
{% if (groups['hosts_processing']|length >0) %}
  - job_name: vitam-processing
    metrics_path: {{ prometheus.metrics_path | default('/admin/v1/metrics') }}
    static_configs:
{% for host in groups['hosts_processing'] %}
      - targets:
          - {{ hostvars[host]['ip_admin'] }}:{{ vitam.processing.port_admin | default(28203) }}
        labels:
          hostname: "{{ host.split(".")[0] }}"
{% endfor %}
{% endif %}
{% if (groups['hosts_worker']|length >0) %}
  - job_name: vitam-worker
    metrics_path: {{ prometheus.metrics_path | default('/admin/v1/metrics') }}
    static_configs:
{% for host in groups['hosts_worker'] %}
      - targets:
          - {{ hostvars[host]['ip_admin'] }}:{{ vitam.worker.port_admin | default(29104) }}
        labels:
          hostname: "{{ host.split(".")[0] }}"
{% endfor %}
{% endif %}
{% if (groups['hosts_storage_engine']|length >0) %}
  - job_name: vitam-storage-engine
    metrics_path: {{ prometheus.metrics_path | default('/admin/v1/metrics') }}
    static_configs:
{% for host in groups['hosts_storage_engine'] %}
      - targets:
          - {{ hostvars[host]['ip_admin'] }}:{{ vitam.storageengine.port_admin | default(29102) }}
        labels:
          hostname: "{{ host.split(".")[0] }}"
{% endfor %}
{% endif %}
{% if (groups['hosts_storage_offer_default']|length >0) %}
{% set offerInstances = [] %}
{% for host in groups['hosts_storage_offer_default'] %}
{{ offerInstances.append({"offerId": hostvars[host]['offer_conf'], "host": host }) }}
{% endfor %}
  - job_name: vitam-storage-offer-default
    metrics_path: {{ prometheus.metrics_path | default('/admin/v1/metrics') }}
    static_configs:
{% for offerId, hosts in offerInstances | groupby('offerId') %}
{% for host in hosts %}
      - targets:
          - {{ hostvars[host.host]['ip_admin'] }}:{{ vitam.storageofferdefault.port_admin | default(29900) }}
        labels:
          offerId: {{ offerId }}
          hostname: "{{ host.host }}"
{% endfor %}
{% endfor %}
{% endif %}
{% if (groups['hosts_functional_administration']|length >0) %}
  - job_name: vitam-functional-administration
    metrics_path: {{ prometheus.metrics_path | default('/admin/v1/metrics') }}
    static_configs:
{% for host in groups['hosts_functional_administration'] %}
      - targets:
          - {{ hostvars[host]['ip_admin'] }}:{{ vitam.functional_administration.port_admin | default(18004) }}
        labels:
          hostname: "{{ host.split(".")[0] }}"
{% endfor %}
{% endif %}
{% if (groups['hosts_metadata_collect']|length >0) %}
  - job_name: vitam-metadata-collect
    metrics_path: {{ prometheus.metrics_path | default('/admin/v1/metrics') }}
    static_configs:
{% for host in groups['hosts_metadata_collect'] %}
      - targets:
          - {{ hostvars[host]['ip_admin'] }}:{{ vitam.metadata_collect.port_admin | default(28290) }}
        labels:
          hostname: "{{ host.split(".")[0] }}"
{% endfor %}
{% endif %}
{% if (groups['hosts_workspace_collect']|length >0) %}
  - job_name: vitam-workspace-collect
    metrics_path: {{ prometheus.metrics_path | default('/admin/v1/metrics') }}
    static_configs:
{% for host in groups['hosts_workspace_collect'] %}
      - targets:
          - {{ hostvars[host]['ip_admin'] }}:{{ vitam.workspace_collect.port_admin | default(28291) }}
        labels:
          hostname: "{{ host.split(".")[0] }}"
{% endfor %}
{% endif %}
{% if (groups['hosts_collect_internal']|length >0) %}
  - job_name: vitam-collect-internal
    metrics_path: {{ prometheus.metrics_path | default('/admin/v1/metrics') }}
    static_configs:
{% for host in groups['hosts_collect_internal'] %}
      - targets:
          - {{ hostvars[host]['ip_admin'] }}:{{ vitam.collect_internal.port_admin | default(28038) }}
        labels:
          hostname: "{{ host.split(".")[0] }}"
{% endfor %}
{% endif %}
{% if (groups['hosts_collect_external']|length >0) %}
  - job_name: vitam-collect-external
    metrics_path: {{ prometheus.metrics_path | default('/admin/v1/metrics') }}
    static_configs:
{% for host in groups['hosts_collect_external'] %}
      - targets:
          - {{ hostvars[host]['ip_admin'] }}:{{ vitam.collect_external.port_admin | default(28030) }}
        labels:
          hostname: "{{ host.split(".")[0] }}"
{% endfor %}
{% endif %}
{% if (groups['hosts_scheduler']|length >0) %}
  - job_name: vitam-scheduler
    metrics_path: {{ prometheus.metrics_path | default('/admin/v1/metrics') }}
    static_configs:
{% for host in groups['hosts_scheduler'] %}
      - targets:
          - {{ hostvars[host]['ip_admin'] }}:{{ vitam.scheduler.port_admin | default(28799) }}
        labels:
          hostname: "{{ host.split(".")[0] }}"
{% endfor %}
{% endif %}
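Note that modifying this file on a machine where Prometheus is already running does not require a full restart: as standard Prometheus behavior, the server re-reads its configuration upon receiving a SIGHUP signal, for example (assuming the prometheus binary name):

kill -HUP "$(pidof prometheus)"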
7.2.10.1.2.2. Generating the configuration file
If a Prometheus server is already in place, the prometheus.yml configuration file can be generated from the inventory of the VITAM environment.
To do so, from the Ansible machine, change to the path_to/vitam/deploiement/ directory and run the following command:
# First specify the output directory in the cots_var.yml file:
#   prometheus:
#     prometheus_config_file_target_directory: path_dir_output
ansible-playbook ansible-vitam-extra/prometheus.yml -i environments/hosts.<environnement> --ask-vault-pass --tags gen_prometheus_config
The configuration file is then generated in the output directory under the name prometheus.yml.
You can then pick up the relevant parts, such as the scrape_configs section, and merge them into the configuration of the existing Prometheus server.
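Before merging them, the generated file can be syntax-checked with the promtool utility shipped with Prometheus (the path below assumes the output directory configured above):

promtool check config path_dir_output/prometheus.yml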
Warning
Network flows between the existing Prometheus server and the machines hosting the VITAM solution must be open on the administration network interface.
7.2.10.1.2.3. Environment variable file
PROMETHEUS_OPTS='--web.listen-address={{ ip_admin }}:{{ prometheus.server.port | default(9090) }} --web.external-url=http://{{ ip_admin }}:{{ prometheus.server.port | default(9090) }}/prometheus --config.file=/vitam/conf/prometheus/prometheus.yml --storage.tsdb.path=/vitam/data/prometheus --storage.tsdb.retention.time={{ prometheus.server.tsdb_retention_time | default("7d") }} --storage.tsdb.retention.size={{ prometheus.server.tsdb_retention_size | default("5GB") }}'
# Following params can be added
# --web.enable-admin-api
# --web.page-title
# --web.cors.origin
# --web.route-prefix
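For reference, this variable file is intended to be consumed by the Prometheus service unit through the PROMETHEUS_OPTS variable. Below is a minimal sketch of such a systemd unit; the unit layout, user, and paths are illustrative assumptions, and the unit actually delivered with the VITAM solution may differ:

[Unit]
Description=Prometheus server
After=network-online.target

[Service]
User=vitam
# Assumed location of the environment variable file defining PROMETHEUS_OPTS
EnvironmentFile=/vitam/conf/prometheus/sysconfig/prometheus
ExecStart=/vitam/bin/prometheus/prometheus $PROMETHEUS_OPTS
# Prometheus re-reads its configuration on SIGHUP
ExecReload=/bin/kill -HUP $MAINPID

[Install]
WantedBy=multi-user.target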
7.2.10.1.2.4. Data files
This service uses data files located in the /vitam/data/prometheus/ directory.
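The on-disk footprint of the time series database is bounded by the --storage.tsdb.retention.* options shown above; current usage can be checked at any time, for example:

du -sh /vitam/data/prometheus/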
7.2.10.1.2.5. Rules delivered with the VITAM solution
7.2.10.1.2.5.1. Machine state
- Raises a critical alert if a machine has been unreachable for more than one minute
groups:
  - name: state
    rules:
      - alert: InstanceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.hostname }} down"
          description: "Instance {{ $labels.hostname }} has been down for more than 1 minute"
7.2.10.1.2.5.2. CPU
- Raises a warning alert if a machine's overall CPU usage has been above 75% for 10 minutes
- Raises a critical alert if a machine's overall CPU usage has been above 90% for 10 minutes
groups:
  - name: cpu
    rules:
      - alert: HighCPUUsage
        expr: (100 - (avg by (instance) (rate(node_cpu_seconds_total{job="vitam-node-exporter",mode="idle"}[1m])) * 100)) > 75
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.hostname }} has high CPU usage"
          description: "Instance {{ $labels.hostname }} has been using at least 75% of its CPU for more than 10 minutes"
      - alert: HighCPUUsage
        expr: (100 - (avg by (instance) (rate(node_cpu_seconds_total{job="vitam-node-exporter",mode="idle"}[1m])) * 100)) > 90
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.hostname }} has critically high CPU usage"
          description: "Instance {{ $labels.hostname }} has been using at least 90% of its CPU for more than 10 minutes"
7.2.10.1.2.5.3. Memory
- Raises a warning alert if a machine's memory usage has been above 75% for 10 minutes
- Raises a critical alert if a machine's memory usage has been above 90% for 10 minutes
groups:
  - name: memory
    rules:
      - alert: HighMemoryUsage
        expr: (100 - ((node_memory_MemAvailable_bytes * 100) / node_memory_MemTotal_bytes)) > 75
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.hostname }} has high memory usage"
          description: "Instance {{ $labels.hostname }} has been using at least 75% of its RAM for more than 10 minutes"
      - alert: HighMemoryUsage
        expr: (100 - ((node_memory_MemAvailable_bytes * 100) / node_memory_MemTotal_bytes)) > 90
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.hostname }} has critically high memory usage"
          description: "Instance {{ $labels.hostname }} has been using at least 90% of its RAM for more than 10 minutes"
7.2.10.1.2.5.4. Disk
- Raises a warning alert if disk usage on a machine's root partition has been above 75% for 10 minutes
- Raises a warning alert if disk usage on a machine's Vitam partition has been above 75% for 10 minutes
- Raises a critical alert if disk usage on a machine's root partition has been above 90% for 10 minutes
- Raises a critical alert if disk usage on a machine's Vitam partition has been above 90% for 10 minutes
groups:
  - name: disk_root
    rules:
      - alert: OutOfDiskSpace
        expr: (100 - ((node_filesystem_avail_bytes{mountpoint="/",fstype!="rootfs"} * 100) / node_filesystem_size_bytes{mountpoint="/",fstype!="rootfs"})) > 75
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.hostname }} is running out of disk space"
          description: "Instance {{ $labels.hostname }} has had its root partition more than 75% full for 10 minutes"
      - alert: OutOfDiskSpace
        expr: (100 - ((node_filesystem_avail_bytes{mountpoint="/",fstype!="rootfs"} * 100) / node_filesystem_size_bytes{mountpoint="/",fstype!="rootfs"})) > 90
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.hostname }} is critically running out of disk space"
          description: "Instance {{ $labels.hostname }} has had its root partition more than 90% full for 10 minutes"
  - name: disk_vitam
    rules:
      - alert: OutOfDiskSpace
        expr: (100 - ((node_filesystem_avail_bytes{mountpoint="/vitam"} * 100) / node_filesystem_size_bytes{mountpoint="/vitam"})) > 75
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.hostname }} is running out of disk space"
          description: "Instance {{ $labels.hostname }} has had its Vitam partition more than 75% full for 10 minutes"
      - alert: OutOfDiskSpace
        expr: (100 - ((node_filesystem_avail_bytes{mountpoint="/vitam"} * 100) / node_filesystem_size_bytes{mountpoint="/vitam"})) > 90
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.hostname }} is critically running out of disk space"
          description: "Instance {{ $labels.hostname }} has had its Vitam partition more than 90% full for 10 minutes"
7.2.10.1.2.6. Adding new rules
To add new rules, simply drop the new rule files into the following directory: deployment/ansible-vitam-extra/roles/prometheus-server/rules/
Then run the following command:
ansible-playbook ansible-vitam-extra/prometheus.yml -i environments/hosts.<environnement> --ask-vault-pass
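As an illustration, here is an example of an additional rule file following the same format as the rules delivered above; the file name, alert name, and 50% threshold are arbitrary and should be adapted (the node_memory_Swap* metrics are exposed by node_exporter):

# rules/swap.rules.yml (illustrative name)
groups:
  - name: swap
    rules:
      - alert: HighSwapUsage
        # More than 50% of the swap space in use for 10 minutes (arbitrary threshold)
        expr: ((node_memory_SwapTotal_bytes - node_memory_SwapFree_bytes) * 100 / node_memory_SwapTotal_bytes) > 50
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.hostname }} has high swap usage"
          description: "Instance {{ $labels.hostname }} has been using more than 50% of its swap for 10 minutes"

A rule file can be validated before deployment with promtool check rules <file>.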