#!/bin/bash
#
# Sittax Observability — agent provisioning.
#
# This first part of the script:
#   1. decides whether commands must be prefixed with sudo,
#   2. installs the base packages (ca-certificates, curl, jq),
#   3. installs Docker Engine from Docker's apt repository when the `docker`
#      command is not already available,
#   4. creates the working directory (WORK_DIR) used by the rest of the script.
set -e

# ANSI colour escapes; they are rendered by `echo -e` in the status messages.
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
RED='\033[0;31m'
NC='\033[0m'

echo -e "${YELLOW}=== Sittax Observability — Provisionamento de Agentes ===${NC}"
echo -e "Instala exporters + OpenTelemetry Collector com OTLP/HTTP pro front da observability\n"

# ============================================================================
# 1. Privilege escalation
# ============================================================================
# SUDO is deliberately expanded UNQUOTED everywhere below: when the script
# already runs as root it is empty and must vanish from the command line.
if [ "$EUID" -ne 0 ]; then
  SUDO="sudo"
else
  SUDO=""
fi

# ============================================================================
# 2. Dependencies
# ============================================================================
echo -e "${YELLOW}[1/5] Instalando dependências...${NC}"
$SUDO apt-get update -qq
$SUDO apt-get install -y -qq ca-certificates curl jq > /dev/null

# ============================================================================
# 3. Docker
# ============================================================================
if ! command -v docker &> /dev/null; then
  echo -e "${YELLOW}[2/5] Instalando Docker Engine...${NC}"

  # Fetch Docker's signing key into the apt keyring directory.
  $SUDO install -m 0755 -d /etc/apt/keyrings
  $SUDO curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
  $SUDO chmod a+r /etc/apt/keyrings/docker.asc

  # Docker's repository is keyed by the *Ubuntu* codename. Ubuntu derivatives
  # report their own name in VERSION_CODENAME and expose the Ubuntu base in
  # UBUNTU_CODENAME, so prefer the latter and fall back to the former (this is
  # the form used in Docker's install instructions).
  docker_arch="$(dpkg --print-architecture)"
  docker_codename="$(. /etc/os-release && echo "${UBUNTU_CODENAME:-$VERSION_CODENAME}")"
  if [ -z "$docker_codename" ]; then
    # Without a codename the sources entry would be malformed and the
    # following `apt-get update` would fail with a far less obvious error.
    echo -e "${RED}Não foi possível determinar o codename da distribuição em /etc/os-release.${NC}" >&2
    exit 1
  fi

  echo "deb [arch=${docker_arch} signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu ${docker_codename} stable" \
    | $SUDO tee /etc/apt/sources.list.d/docker.list > /dev/null

  $SUDO apt-get update -qq
  $SUDO apt-get install -y -qq docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin > /dev/null

  $SUDO systemctl enable docker
  $SUDO systemctl start docker
else
  # `docker --version` prints "Docker version X.Y.Z, build ..."; keep field 3
  # and strip the trailing comma.
  echo -e "${GREEN}[2/5] Docker já instalado ($(docker --version | cut -d' ' -f3 | tr -d ',')).${NC}"
fi

# Directory that receives the generated docker-compose.yml and collector config.
WORK_DIR="/opt/sittax/observability"
$SUDO mkdir -p "$WORK_DIR"

# ============================================================================
# 4.
Configuração interativa # ============================================================================ echo -e "\n${YELLOW}[3/5] Configuração dos Agentes${NC}" read -p "Nome da instância (usado como prefixo nos jobs: -node, -docker) [prod]: " INSTANCE_NAME /dev/null || true fi # --- Rede externa config --- if [ -n "$APP_NETWORK" ]; then if ! $SUDO docker network ls --format '{{.Name}}' | grep -wq "$APP_NETWORK"; then echo -e " Criando rede externa '$APP_NETWORK'..." $SUDO docker network create "$APP_NETWORK" 2>/dev/null || true fi NET_EXT_BLOCK=" ${APP_NETWORK}: external: true" NET_CADVISOR=" - ${APP_NETWORK}" NET_EXPORTER=" - ${APP_NETWORK}" else NET_EXT_BLOCK="" NET_CADVISOR="" NET_EXPORTER="" fi # ---------- docker-compose.yml ---------- $SUDO tee "${WORK_DIR}/docker-compose.yml" > /dev/null << COMPOSE_EOF networks: monitoring: driver: bridge ${NET_EXT_BLOCK} services: # ============================================ # OpenTelemetry Collector — scrape dos exporters # e envio via OTLP/HTTP pro otel-front (otel.sittax.com.br) # ============================================ otel-collector: image: otel/opentelemetry-collector-contrib:latest container_name: otel-collector restart: unless-stopped networks: - monitoring volumes: - ./otel-collector-config.yaml:/etc/otelcol-contrib/config.yaml:ro command: - '--config=/etc/otelcol-contrib/config.yaml' COMPOSE_EOF # --- Node Exporter --- if [[ "$USE_NODE" =~ ^[SsYy]$ ]]; then $SUDO tee -a "${WORK_DIR}/docker-compose.yml" > /dev/null << 'NODE_EOF' node-exporter: image: prom/node-exporter:latest container_name: node-exporter restart: unless-stopped privileged: true pid: host networks: - monitoring volumes: - /:/host:ro,rslave command: - '--path.rootfs=/host' - '--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)' - '--collector.filesystem.ignored-fs-types=^(autofs|proc|tmpfs|devtmpfs|sysfs|cgroup|overlay|nsfs|squashfs)$$' NODE_EOF fi # --- cAdvisor --- if [[ "$USE_CADVISOR" =~ ^[SsYy]$ ]]; then $SUDO tee 
-a "${WORK_DIR}/docker-compose.yml" > /dev/null << CADVISOR_EOF cadvisor: image: gcr.io/cadvisor/cadvisor:v0.49.1 container_name: cadvisor restart: unless-stopped privileged: true networks: - monitoring ${NET_CADVISOR} devices: - /dev/kmsg volumes: - /:/rootfs:ro - /var/run:/var/run:ro - /sys:/sys:ro - /var/lib/docker/:/var/lib/docker:ro - /dev/disk/:/dev/disk:ro command: - '--housekeeping_interval=15s' - '--max_housekeeping_interval=1m' - '--docker_only=true' # Habilita só o essencial pros dashboards (corta ~80% das séries). # Inclui oom_event pra detectar containers OOM-killed. - '--enable_metrics=cpu,cpuLoad,memory,network,oom_event' CADVISOR_EOF fi # --- RabbitMQ Exporter --- if [[ "$USE_RABBIT" =~ ^[SsYy]$ ]]; then $SUDO tee -a "${WORK_DIR}/docker-compose.yml" > /dev/null << RABBIT_EOF rabbitmq-exporter: image: kbudde/rabbitmq-exporter:latest container_name: rabbitmq-exporter restart: unless-stopped environment: INCLUDE_VHOST: '.*' RABBIT_USER: ${RABBIT_USER} RABBIT_PASSWORD: ${RABBIT_PASS} RABBIT_URL: ${RABBIT_URL} SKIP_VERIFY: 'true' networks: - monitoring ${NET_EXPORTER} RABBIT_EOF fi # --- Redis Exporter --- if [[ "$USE_REDIS" =~ ^[SsYy]$ ]]; then REDIS_ENV=" - REDIS_ADDR=${REDIS_ADDR}" if [ -n "$REDIS_PASS" ]; then REDIS_ENV="${REDIS_ENV}\n - REDIS_PASSWORD=${REDIS_PASS}" fi $SUDO tee -a "${WORK_DIR}/docker-compose.yml" > /dev/null << REDIS_EOF redis-exporter: image: oliver006/redis_exporter:latest container_name: redis-exporter restart: unless-stopped environment: $(echo -e "$REDIS_ENV") networks: - monitoring ${NET_EXPORTER} REDIS_EOF fi # --- Blackbox Exporter --- if [[ "$USE_BLACKBOX" =~ ^[SsYy]$ ]]; then $SUDO tee -a "${WORK_DIR}/docker-compose.yml" > /dev/null << 'BLACKBOX_EOF' blackbox-exporter: image: prom/blackbox-exporter:latest container_name: blackbox-exporter restart: unless-stopped networks: - monitoring BLACKBOX_EOF fi # --- Nginx Exporter --- if [[ "$USE_NGINX" =~ ^[SsYy]$ ]]; then $SUDO tee -a "${WORK_DIR}/docker-compose.yml" > /dev/null 
<< NGINX_EOF nginx-exporter: image: nginx/nginx-prometheus-exporter:latest container_name: nginx-exporter restart: unless-stopped command: - '-nginx.scrape-uri=${NGINX_URL}' networks: - monitoring ${NET_EXPORTER} NGINX_EOF fi # ---------- otel-collector-config.yaml ---------- # Receiver prometheus mantém compatibilidade com scrape_configs do prometheus-agent. # Resource processor garante service.instance.id (vira label `instance` no Mimir) # e environment (consumido pelo transform/add_env do otel-back). $SUDO tee "${WORK_DIR}/otel-collector-config.yaml" > /dev/null << OTEL_HEADER receivers: prometheus: config: global: scrape_interval: ${SCRAPE_INTERVAL} external_labels: instance: ${INSTANCE_NAME} scrape_configs: OTEL_HEADER if [[ "$USE_NODE" =~ ^[SsYy]$ ]]; then $SUDO tee -a "${WORK_DIR}/otel-collector-config.yaml" > /dev/null << OTEL_NODE - job_name: '${INSTANCE_NAME}-node' static_configs: - targets: ['node-exporter:9100'] OTEL_NODE fi if [[ "$USE_CADVISOR" =~ ^[SsYy]$ ]]; then $SUDO tee -a "${WORK_DIR}/otel-collector-config.yaml" > /dev/null << OTEL_CADVISOR - job_name: '${INSTANCE_NAME}-docker' static_configs: - targets: ['cadvisor:8080'] OTEL_CADVISOR fi if [[ "$USE_RABBIT" =~ ^[SsYy]$ ]]; then $SUDO tee -a "${WORK_DIR}/otel-collector-config.yaml" > /dev/null << OTEL_RABBIT - job_name: '${INSTANCE_NAME}-rabbit' static_configs: - targets: ['rabbitmq-exporter:9419'] OTEL_RABBIT fi if [[ "$USE_REDIS" =~ ^[SsYy]$ ]]; then $SUDO tee -a "${WORK_DIR}/otel-collector-config.yaml" > /dev/null << OTEL_REDIS - job_name: '${INSTANCE_NAME}-redis' static_configs: - targets: ['redis-exporter:9121'] OTEL_REDIS fi if [[ "$USE_BLACKBOX" =~ ^[SsYy]$ ]]; then $SUDO tee -a "${WORK_DIR}/otel-collector-config.yaml" > /dev/null << OTEL_BB - job_name: '${INSTANCE_NAME}-blackbox' metrics_path: /probe params: module: [http_2xx] static_configs: - targets: ['${BB_TARGET}'] relabel_configs: - source_labels: [__address__] target_label: __param_target - source_labels: [__param_target] 
target_label: instance - target_label: __address__ replacement: blackbox-exporter:9115 OTEL_BB fi if [[ "$USE_NGINX" =~ ^[SsYy]$ ]]; then $SUDO tee -a "${WORK_DIR}/otel-collector-config.yaml" > /dev/null << OTEL_NGINX - job_name: '${INSTANCE_NAME}-nginx' static_configs: - targets: ['nginx-exporter:9113'] OTEL_NGINX fi # Processors + exporter OTLP/HTTP + pipeline. # - memory_limiter: protege o agent de OOM # - resource: marca service.instance.id (= label `instance` no Mimir após reconversão # no otel-back) e environment (consumido pelo transform/add_env do otel-back) # - batch: agrega antes de enviar $SUDO tee -a "${WORK_DIR}/otel-collector-config.yaml" > /dev/null << OTEL_TAIL processors: memory_limiter: check_interval: 1s limit_mib: 400 spike_limit_mib: 100 resource: attributes: - key: service.instance.id value: ${INSTANCE_NAME} action: upsert - key: environment value: ${INSTANCE_NAME} action: upsert batch: timeout: 2s send_batch_size: 5000 send_batch_max_size: 5500 exporters: otlphttp: endpoint: ${OTLP_ENDPOINT} compression: gzip tls: insecure_skip_verify: true sending_queue: enabled: true num_consumers: 5 queue_size: 5000 retry_on_failure: enabled: true initial_interval: 5s max_interval: 30s service: pipelines: metrics: receivers: [prometheus] processors: [memory_limiter, resource, batch] exporters: [otlphttp] telemetry: logs: level: info OTEL_TAIL # --- Docker Health Agent (sobe junto com o resto do compose) --- # Precisa rodar num host swarm manager pra conseguir `docker service ls`. # Avisa caso contrário, mas deixa o user decidir (pode ser worker com socket remoto). 
# Optional Docker Health Agent service, appended to the generated compose file.
if [[ "$USE_HEALTH_AGENT" =~ ^[SsYy]$ ]]; then
  # Query the daemon through $SUDO, like every other docker call in this
  # script: without it a user outside the `docker` group gets a permission
  # error, the fallbacks below kick in, and the warning fires even on a real
  # swarm manager.
  SWARM_STATE=$($SUDO docker info --format '{{.Swarm.LocalNodeState}}' 2>/dev/null || echo "inactive")
  IS_MANAGER=$($SUDO docker info --format '{{.Swarm.ControlAvailable}}' 2>/dev/null || echo "false")

  # Only a warning: the operator decides whether to go ahead anyway.
  if [ "$SWARM_STATE" != "active" ] || [ "$IS_MANAGER" != "true" ]; then
    echo -e "${YELLOW} ⚠️ Atenção: este host não parece ser swarm manager.${NC}"
    echo -e "${YELLOW} (state=$SWARM_STATE manager=$IS_MANAGER) — o agent vai falhar ao listar services.${NC}"
  fi

  # Unquoted delimiter on purpose: the ${...} values are expanded now and
  # written literally into the compose file. The entry is indented as a child
  # of the top-level `services:` key that the compose file ends with.
  $SUDO tee -a "${WORK_DIR}/docker-compose.yml" > /dev/null << HEALTH_EOF

  docker-health-agent:
    image: registry.sittax.com.br/sittax-docker-healthcheck:latest
    container_name: docker-health-agent
    restart: unless-stopped
    networks:
      - monitoring
    environment:
      - MONITOR_URL=${SENTINEL_URL}
      - STACKS_API_KEY=${STACKS_API_KEY}
      - HOST=${INSTANCE_NAME}
      - STACKS=${HEALTH_STACKS}
      - STACKS_EXCLUDE=${HEALTH_EXCLUDE}
      - INTERVAL_SECONDS=${HEALTH_INTERVAL}
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock:ro
    mem_limit: 64m
HEALTH_EOF
fi

# ============================================================================
# 6. Start
# ============================================================================
echo -e "\n${YELLOW}[5/5] Iniciando stack...${NC}"
cd "$WORK_DIR"

# When the health agent is enabled, try pulling its image from the private
# registry up front. A failed pull (most likely a missing `docker login`) only
# prints guidance and the script carries on.
# NOTE(review): the original author expects `docker compose up` to then fail
# for that one service only — confirm; under `set -e` a failing `up` aborts
# the script before the summary below is printed.
if [[ "$USE_HEALTH_AGENT" =~ ^[SsYy]$ ]]; then
  if ! $SUDO docker pull registry.sittax.com.br/sittax-docker-healthcheck:latest >/dev/null 2>&1; then
    echo -e "${YELLOW} ⚠️ Não consegui puxar a imagem do health-agent — provável falta de login.${NC}"
    echo -e "${YELLOW} Rode: docker login registry.sittax.com.br${NC}"
    echo -e "${YELLOW} Depois: docker compose -f ${WORK_DIR}/docker-compose.yml up -d docker-health-agent${NC}"
  fi
fi

$SUDO docker compose up -d

# ---------------------------------------------------------------------------
# Summary
# ---------------------------------------------------------------------------
echo ""
echo -e "${GREEN}=== Observabilidade provisionada com sucesso ===${NC}"
echo ""
echo -e " Diretório: ${WORK_DIR}"
echo -e " Instância: ${INSTANCE_NAME}"
echo -e " OTLP endpoint: ${OTLP_ENDPOINT}"
echo ""
echo -e " Jobs criados (label \`job\` no Mimir):"
# One line per exporter that was enabled during configuration.
if [[ "$USE_NODE" =~ ^[SsYy]$ ]]; then
  echo -e " - ${INSTANCE_NAME}-node"
fi
if [[ "$USE_CADVISOR" =~ ^[SsYy]$ ]]; then
  echo -e " - ${INSTANCE_NAME}-docker"
fi
if [[ "$USE_RABBIT" =~ ^[SsYy]$ ]]; then
  echo -e " - ${INSTANCE_NAME}-rabbit"
fi
if [[ "$USE_REDIS" =~ ^[SsYy]$ ]]; then
  echo -e " - ${INSTANCE_NAME}-redis"
fi
if [[ "$USE_BLACKBOX" =~ ^[SsYy]$ ]]; then
  echo -e " - ${INSTANCE_NAME}-blackbox"
fi
if [[ "$USE_NGINX" =~ ^[SsYy]$ ]]; then
  echo -e " - ${INSTANCE_NAME}-nginx"
fi
echo ""

if [[ "$USE_HEALTH_AGENT" =~ ^[SsYy]$ ]]; then
  echo -e " ${GREEN}Docker Health Agent${NC}: reportando como host='${INSTANCE_NAME}' pro Sentinel"
  # An empty HEALTH_STACKS is reported as "automatic discovery (all)".
  if [ -n "$HEALTH_STACKS" ]; then
    echo -e " stacks (allowlist): ${HEALTH_STACKS}"
  else
    echo -e " stacks: descoberta automática (todas)"
  fi
  echo -e " exclui: ${HEALTH_EXCLUDE:-(nenhuma)}"
  echo -e " logs: docker logs -f docker-health-agent"
  echo ""
fi

echo -e " Dashboard: https://grafana.sittax.com.br"
echo -e " Logs do agent: docker logs -f otel-collector"
echo ""
echo -e " Para reconfigurar: edite ${WORK_DIR}/otel-collector-config.yaml e rode:"
echo -e " docker compose -f ${WORK_DIR}/docker-compose.yml restart otel-collector"
echo ""