7.2.10.1.2. Configuration / useful files

7.2.10.1.2.1. Configuration file

prometheus.yml

# my global config
global:
  scrape_interval:     {{ prometheus.server.scrape_interval | default(15) }}s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: {{ prometheus.server.evaluation_interval | default(15) }}s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
  - static_configs:
    # - alertmanager:9093
{% for host in groups['hosts_alertmanager'] %}
    - targets:
      - {{ hostvars[host]['ip_admin'] }}:{{ prometheus.alertmanager.api_port | default(9093) }}
      labels:
        hostname: "{{ host.split(".")[0] }}"
{% endfor %}

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  # - rule.yml
{% for item in rules_files.files %}
  - {{ item.path }}
{% endfor %}

scrape_configs:
{% if prometheus.node_exporter.enabled | default(true) | bool == true %}
  - job_name: vitam-node-exporter
    metrics_path: {{ prometheus.node_exporter.metrics_path | default('/metrics') }}
    static_configs:
{% for host in groups['vitam'] %}
    - targets:
      - {{ hostvars[host]['ip_admin'] }}:{{ prometheus.node_exporter.port | default(9101) }}
      labels:
        hostname: "{{ host.split(".")[0] }}"
{% if host in groups['hosts_elasticsearch_data'] %}
        elastic_cluster_name: "{{ elasticsearch.data.cluster_name }}"
{% elif host in groups['hosts_elasticsearch_log'] %}
        elastic_cluster_name: "{{ elasticsearch.log.cluster_name }}"
{% endif %}
{% endfor %}
{% endif %}

{% if prometheus.consul_exporter.enabled | default(true) | bool == true %}
  - job_name: vitam-consul-exporter
    metrics_path: {{ prometheus.consul_exporter.metrics_path | default('/metrics') }}
    static_configs:
{% for host in groups['vitam'] %}
    - targets:
      - {{ hostvars[host]['ip_admin'] }}:{{ prometheus.consul_exporter.port | default(9107) }}
      labels:
        hostname: "{{ host.split(".")[0] }}"
{% endfor %}
{% endif %}

{% if prometheus.elasticsearch_exporter.enabled | default(true) | bool == true %}
  - job_name: vitam-elasticsearch-exporter
    metrics_path: {{ prometheus.elasticsearch_exporter.metrics_path | default('/metrics') }}
    static_configs:
{% for host in groups['elasticsearch'] %}
    - targets:
      - {{ hostvars[host]['ip_admin'] }}:{{ prometheus.elasticsearch_exporter.port | default(9114) }}
      labels:
        hostname: "{{ host.split(".")[0] }}"
{% endfor %}
{% endif %}

{% if (groups['hosts_access_internal']|length >0) %}
  - job_name: vitam-access-internal
    metrics_path: {{ prometheus.metrics_path | default('/admin/v1/metrics') }}
    static_configs:
{% for host in groups['hosts_access_internal'] %}
    - targets:
      - {{ hostvars[host]['ip_admin'] }}:{{ vitam.accessinternal.port_admin | default(28101) }}
      labels:
        hostname: "{{ host.split(".")[0] }}"
{% endfor %}
{% endif %}

{% if (groups['hosts_access_external']|length >0) %}
  - job_name: vitam-access-external
    metrics_path: {{ prometheus.metrics_path | default('/admin/v1/metrics') }}
    static_configs:
{% for host in groups['hosts_access_external'] %}
    - targets:
      - {{ hostvars[host]['ip_admin'] }}:{{ vitam.accessexternal.port_admin | default(28102) }}
      labels:
        hostname: "{{ host.split(".")[0] }}"
{% endfor %}
{% endif %}

{% if (groups['hosts_ingest_internal']|length >0) %}
  - job_name: vitam-ingest-internal
    metrics_path: {{ prometheus.metrics_path | default('/admin/v1/metrics') }}
    static_configs:
{% for host in groups['hosts_ingest_internal'] %}
    - targets:
      - {{ hostvars[host]['ip_admin'] }}:{{ vitam.ingestinternal.port_admin | default(28100) }}
      labels:
        hostname: "{{ host.split(".")[0] }}"
{% endfor %}
{% endif %}

{% if (groups['hosts_ingest_external']|length >0) %}
  - job_name: vitam-ingest-external
    metrics_path: {{ prometheus.metrics_path | default('/admin/v1/metrics') }}
    static_configs:
{% for host in groups['hosts_ingest_external'] %}
    - targets:
      - {{ hostvars[host]['ip_admin'] }}:{{ vitam.ingestexternal.port_admin | default(28001) }}
      labels:
        hostname: "{{ host.split(".")[0] }}"
{% endfor %}
{% endif %}

{% if (groups['hosts_metadata']|length >0) %}
  - job_name: vitam-metadata
    metrics_path: {{ prometheus.metrics_path | default('/admin/v1/metrics') }}
    static_configs:
{% for host in groups['hosts_metadata'] %}
    - targets:
      - {{ hostvars[host]['ip_admin'] }}:{{ vitam.metadata.port_admin | default(28200) }}
      labels:
        hostname: "{{ host.split(".")[0] }}"
{% endfor %}
{% endif %}

{% if (groups['hosts_ihm_demo']|length >0) %}
  - job_name: vitam-ihm-demo
    metrics_path: {{ prometheus.metrics_path | default('/admin/v1/metrics') }}
    static_configs:
{% for host in groups['hosts_ihm_demo'] %}
    - targets:
      - {{ hostvars[host]['ip_admin'] }}:{{ vitam.ihm_demo.port_admin | default(28002) }}
      labels:
        hostname: "{{ host.split(".")[0] }}"
{% endfor %}
{% endif %}

{% if (groups['hosts_ihm_recette']|length >0) %}
  - job_name: vitam-ihm-recette
    metrics_path: {{ prometheus.metrics_path | default('/admin/v1/metrics') }}
    static_configs:
{% for host in groups['hosts_ihm_recette'] %}
    - targets:
      - {{ hostvars[host]['ip_admin'] }}:{{ vitam.ihm_recette.port_admin | default(28204) }}
      labels:
        hostname: "{{ host.split(".")[0] }}"
{% endfor %}
{% endif %}

{% if (groups['hosts_logbook']|length >0) %}
  - job_name: vitam-logbook
    metrics_path: {{ prometheus.metrics_path | default('/admin/v1/metrics') }}
    static_configs:
{% for host in groups['hosts_logbook'] %}
    - targets:
      - {{ hostvars[host]['ip_admin'] }}:{{ vitam.logbook.port_admin | default(29002) }}
      labels:
        hostname: "{{ host.split(".")[0] }}"
{% endfor %}
{% endif %}

{% if (groups['hosts_workspace']|length >0) %}
  - job_name: vitam-workspace
    metrics_path: {{ prometheus.metrics_path | default('/admin/v1/metrics') }}
    static_configs:
{% for host in groups['hosts_workspace'] %}
    - targets:
      - {{ hostvars[host]['ip_admin'] }}:{{ vitam.workspace.port_admin | default(28201) }}
      labels:
        hostname: "{{ host.split(".")[0] }}"
{% endfor %}
{% endif %}

{% if (groups['hosts_processing']|length >0) %}
  - job_name: vitam-processing
    metrics_path: {{ prometheus.metrics_path | default('/admin/v1/metrics') }}
    static_configs:
{% for host in groups['hosts_processing'] %}
    - targets:
      - {{ hostvars[host]['ip_admin'] }}:{{ vitam.processing.port_admin | default(28203) }}
      labels:
        hostname: "{{ host.split(".")[0] }}"
{% endfor %}
{% endif %}

{% if (groups['hosts_worker']|length >0) %}
  - job_name: vitam-worker
    metrics_path: {{ prometheus.metrics_path | default('/admin/v1/metrics') }}
    static_configs:
{% for host in groups['hosts_worker'] %}
    - targets:
      - {{ hostvars[host]['ip_admin'] }}:{{ vitam.worker.port_admin | default(29104) }}
      labels:
        hostname: "{{ host.split(".")[0] }}"
{% endfor %}
{% endif %}

{% if (groups['hosts_storage_engine']|length >0) %}
  - job_name: vitam-storage-engine
    metrics_path: {{ prometheus.metrics_path | default('/admin/v1/metrics') }}
    static_configs:
{% for host in groups['hosts_storage_engine'] %}
    - targets:
      - {{ hostvars[host]['ip_admin'] }}:{{ vitam.storageengine.port_admin | default(29102) }}
      labels:
        hostname: "{{ host.split(".")[0] }}"
{% endfor %}
{% endif %}

{% if (groups['hosts_storage_offer_default']|length >0) %}
{% set offerInstances = [] %}
{% for host in groups['hosts_storage_offer_default'] %}
{{ offerInstances.append({"offerId": hostvars[host]['offer_conf'], "host": host }) }}
{% endfor %}
  - job_name: vitam-storage-offer-default
    metrics_path: {{ prometheus.metrics_path | default('/admin/v1/metrics') }}
    static_configs:
{% for offerId, hosts in offerInstances | groupby('offerId') %}
{% for host in hosts %}
    - targets:
      - {{ hostvars[host.host]['ip_admin'] }}:{{ vitam.storageofferdefault.port_admin | default(29900) }}
      labels:
        offerId: {{ offerId }}
        hostname: "{{ host.host }}"
{% endfor %}
{% endfor %}
{% endif %}

{% if (groups['hosts_functional_administration']|length >0) %}
  - job_name: vitam-functional-administration
    metrics_path: {{ prometheus.metrics_path | default('/admin/v1/metrics') }}
    static_configs:
{% for host in groups['hosts_functional_administration'] %}
    - targets:
      - {{ hostvars[host]['ip_admin'] }}:{{ vitam.functional_administration.port_admin | default(18004) }}
      labels:
        hostname: "{{ host.split(".")[0] }}"
{% endfor %}
{% endif %}

7.2.10.1.2.2. Generating the configuration file

If a Prometheus server is already in place, the prometheus.yml configuration file can be generated from the inventory of the VITAM solution's environment.

To do so, from the Ansible machine, go to the path_to/vitam/deployment/ directory and run the following command line:

# Beforehand, specify the output directory in the cots_var.yml file:
#   prometheus.prometheus_config_file_target_directory: path_dir_output
ansible-playbook ansible-vitam-extra/prometheus.yml -i environments/hosts.<environnement> --ask-vault-pass --tags gen_prometheus_config

The configuration file is then generated in the output directory under the name prometheus.yml. Simply take the parts you need, such as scrape_configs, and integrate them into the configuration of the existing Prometheus server.
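For illustration, a minimal sketch of what the existing server's configuration might look like after such an integration; the job shown follows the generated file, but the IP address, port and hostname label are placeholder values to adapt to your environment:

# Extract of the existing server's prometheus.yml after integration (illustrative values)
scrape_configs:
  # Job already defined on the existing server
  - job_name: prometheus
    static_configs:
      - targets: ['localhost:9090']
  # Job copied from the generated prometheus.yml (placeholder IP address and port)
  - job_name: vitam-node-exporter
    metrics_path: /metrics
    static_configs:
      - targets:
          - 10.0.0.11:9101
        labels:
          hostname: "vitam-node-01"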

Warning

Network flows between the existing Prometheus server and the various machines hosting the VITAM solution must be open on the administration network interface.

7.2.10.1.2.3. Environment variable file

PROMETHEUS_OPTS='--web.listen-address={{ ip_admin }}:{{ prometheus.server.port | default(9090) }} --web.external-url=http://{{ ip_admin }}:{{ prometheus.server.port | default(9090) }}/prometheus --config.file=/vitam/conf/prometheus/prometheus.yml --storage.tsdb.path=/vitam/data/prometheus --storage.tsdb.retention.time={{ prometheus.server.tsdb_retention_time | default("7d") }} --storage.tsdb.retention.size={{ prometheus.server.tsdb_retention_size | default("5GB") }}'
# Following params can be added
# --web.enable-admin-api
# --web.page-title
# --web.cors.origin
# --web.route-prefix

7.2.10.1.2.4. Data files

This service uses data files located in the /vitam/data/prometheus/ directory.

7.2.10.1.2.5. Rules delivered with the VITAM solution

7.2.10.1.2.5.1. Machine state

  • Raises a critical alert if a machine has been unreachable for more than one minute
groups:
  - name: state
    rules:
      - alert: InstanceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.hostname }} down"
          description: "Instance {{ $labels.hostname }} has been down for more than 1 minute"

7.2.10.1.2.5.2. CPU

  • Raises a warning alert if a machine's overall CPU usage has been above 75% for 10 minutes
  • Raises a critical alert if a machine's overall CPU usage has been above 90% for 10 minutes
groups:
  - name: cpu
    rules:
      - alert: HighCPUUsage
        expr: (100 - (avg by (instance) (rate(node_cpu_seconds_total{job="vitam-node-exporter",mode="idle"}[1m])) * 100)) > 75
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.hostname }} has high CPU usage"
          description: "Instance {{ $labels.hostname }} has been using at least 75% of its CPU for more than 10 minutes"
      - alert: HighCPUUsage
        expr: (100 - (avg by (instance) (rate(node_cpu_seconds_total{job="vitam-node-exporter",mode="idle"}[1m])) * 100)) > 90
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.hostname }} has critically high CPU usage"
          description: "Instance {{ $labels.hostname }} has been using at least 90% of its CPU for more than 10 minutes"

7.2.10.1.2.5.3. Memory

  • Raises a warning alert if a machine's memory usage has been above 75% for 10 minutes
  • Raises a critical alert if a machine's memory usage has been above 90% for 10 minutes
groups:
  - name: memory
    rules:
      - alert: HighMemoryUsage
        expr: (100 - ((node_memory_MemAvailable_bytes * 100) / node_memory_MemTotal_bytes)) > 75
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.hostname }} has high memory usage"
          description: "Instance {{ $labels.hostname }} has been using at least 75% of its RAM for more than 10 minutes"
      - alert: HighMemoryUsage
        expr: (100 - ((node_memory_MemAvailable_bytes * 100) / node_memory_MemTotal_bytes)) > 90
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.hostname }} has critically high memory usage"
          description: "Instance {{ $labels.hostname }} has been using at least 90% of its RAM for more than 10 minutes"

7.2.10.1.2.5.4. Disk

  • Raises a warning alert if disk usage on a machine's root partition has been above 75% for 10 minutes
  • Raises a warning alert if disk usage on a machine's Vitam partition has been above 75% for 10 minutes
  • Raises a critical alert if disk usage on a machine's root partition has been above 90% for 10 minutes
  • Raises a critical alert if disk usage on a machine's Vitam partition has been above 90% for 10 minutes
groups:
  - name: disk_root
    rules:
      - alert: OutOfDiskSpace
        expr: (100 - ((node_filesystem_avail_bytes{mountpoint="/",fstype!="rootfs"} * 100) / node_filesystem_size_bytes{mountpoint="/",fstype!="rootfs"})) > 75
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.hostname }} is running out of disk space"
          description: "Instance {{ $labels.hostname }} has had its root partition more than 75% full for more than 10 minutes"
      - alert: OutOfDiskSpace
        expr: (100 - ((node_filesystem_avail_bytes{mountpoint="/",fstype!="rootfs"} * 100) / node_filesystem_size_bytes{mountpoint="/",fstype!="rootfs"})) > 90
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.hostname }} is critically running out of disk space"
          description: "Instance {{ $labels.hostname }} has had its root partition more than 90% full for more than 10 minutes"
  - name: disk_vitam
    rules:
      - alert: OutOfDiskSpace
        expr: (100 - ((node_filesystem_avail_bytes{mountpoint="/vitam"} * 100) / node_filesystem_size_bytes{mountpoint="/vitam"})) > 75
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.hostname }} is running out of disk space"
          description: "Instance {{ $labels.hostname }} has had its Vitam partition more than 75% full for more than 10 minutes"
      - alert: OutOfDiskSpace
        expr: (100 - ((node_filesystem_avail_bytes{mountpoint="/vitam"} * 100) / node_filesystem_size_bytes{mountpoint="/vitam"})) > 90
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.hostname }} is critically running out of disk space"
          description: "Instance {{ $labels.hostname }} has had its Vitam partition more than 90% full for more than 10 minutes"

7.2.10.1.2.6. Adding new rules

To add new rules, simply place the new rule files in the following directory: deployment/ansible-vitam-extra/roles/prometheus-server/rules/
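As an example, a hypothetical rule file (the file name, metric and 50% threshold are chosen here purely for illustration, following the pattern of the delivered rules) could raise a warning when a machine uses more than half of its swap:

# Hypothetical example: deployment/ansible-vitam-extra/roles/prometheus-server/rules/swap.rules.yml
groups:
  - name: swap
    rules:
      - alert: HighSwapUsage
        # node_memory_Swap*_bytes metrics are exposed by the node_exporter (vitam-node-exporter job)
        expr: (100 - ((node_memory_SwapFree_bytes * 100) / node_memory_SwapTotal_bytes)) > 50
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.hostname }} has high swap usage"
          description: "Instance {{ $labels.hostname }} has been using at least 50% of its swap for more than 10 minutes"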

Then run the following command:

ansible-playbook ansible-vitam-extra/prometheus.yml -i environments/hosts.<environnement> --ask-vault-pass