Adding a monitoring base, before adding Prometheus and the rest

This commit is contained in:
Sebastian Blasiak 2018-06-09 10:29:15 +02:00
commit 9487fb10aa
8 changed files with 10657 additions and 0 deletions

73
.gitignore vendored Normal file
View File

@ -0,0 +1,73 @@
# swp vim edits
*.swp
*.swo
.os_release.sh
.configuration.cluster
.configuration.installs
.awscli-bundle/
# Python
*.pyc
**/.cache
**/.coverage
# Eclipse related files
.classpath
.cproject
.deps
.project
.settings
.metadata/
# MAC DS files
.DS_Store
# STS
.springBeans
# Maven and build related
target/
packages.mk
# IntelliJ IDEA related files
*.iml
*.iws
*.ipr
.idea/
.sonar-ide.properties
*.orig
*rebel*.xml
.idea/
/INSTALL
/Makefile
/aclocal.m4
/autom4te.cache/
/build/aux/
/config.log
/config.status
/configure
# sed related files
**/*.bak_remove
# terraform state files
**/.terraform
**/*.tfstate
**/*.tfstate.backup
**/.terraform.tfstate.lock.info
# Folders and files
certificates
terraform
configuration.cluster
configuration.installs
other
ec2
monitoring.backup
monitoring/monitoring_*_dir
**/*.bak_remove
**/*.wrapped
data

1
README.md Normal file
View File

@ -0,0 +1 @@
# monitoring-grafana-influxdb-telegraf-prometheus

54
docker-compose.yml Normal file
View File

@ -0,0 +1,54 @@
version: '3'
networks:
public: {}
volumes:
grafana_lib: {}
grafana_ds:
services:
influxdb:
image: influxdb:alpine
container_name: influxdb
ports:
- "8086:8086"
networks:
- public
volumes:
- ./data/influxdb:/var/lib/influxdb
environment:
INFLUXDB_REPORTING_DISABLED: "true"
INFLUXDB_DB: telegraf
INFLUXDB_USER: telegraf
INFLUXDB_USER_PASSWORD: nimda
grafana:
image: grafana/grafana:5.1.3
container_name: grafana
ports:
- "3000:3000"
networks:
- public
volumes:
- grafana_lib:/var/lib/grafana
- grafana_ds:/var/lib/grafana/ds:rw
- ${PWD}/grafana/add_datasources.sh:/var/lib/grafana/ds/add_datasources.sh
environment:
GF_AUTH_ANONYMOUS_ENABLED: "true"
GF_AUTH_ANONYMOUS_ORG_ROLE: "Admin"
INFLUXDB_URI: "http://influxdb:8086"
INFLUXDB_DB: telegraf
INFLUXDB_USER: telegraf
INFLUXDB_USER_PASSWORD: nimda
command: ["bash", "/var/lib/grafana/ds/add_datasources.sh"]
telegraf:
image: telegraf:latest
container_name: telegraf
network_mode: "host"
volumes:
- ./telegraf.conf:/etc/telegraf/telegraf.conf:ro
environment:
# real influx host
INFLUXDB_URI: "http://localhost:8086"

97
grafana/add_dashboards.sh Normal file
View File

@ -0,0 +1,97 @@
#!/bin/bash
# Install Grafana dashboards through the Grafana HTTP API.
set -e

# ANSI color codes used for log output (rendered via `echo -e`).
readonly NC='\033[0m'
readonly RED='\033[00;31m'
readonly GREEN='\033[00;32m'
readonly YELLOW='\033[00;33m'
readonly BLUE='\033[00;34m'
readonly PURPLE='\033[00;35m'
readonly CYAN='\033[00;36m'
readonly LIGHTGRAY='\033[00;37m'
readonly MAGENTA='\033[00;35m'
readonly LRED='\033[01;31m'
readonly LGREEN='\033[01;32m'
readonly LYELLOW='\033[01;33m'
readonly LBLUE='\033[01;34m'
readonly LPURPLE='\033[01;35m'
readonly LCYAN='\033[01;36m'
readonly WHITE='\033[01;37m'

# Grafana endpoint (admin:admin matches docker-compose defaults).
# Overridable from the environment; the default is unchanged.
GRAFANA_URL=${GRAFANA_URL:-http://admin:admin@localhost:3000}
#######################################
# Perform one Grafana HTTP API call.
# Globals:   GRAFANA_URL (read)
# Arguments: $1 - HTTP verb (GET/POST/...)
#            $2 - API path, e.g. /api/dashboards/db
#            $3 - optional inline request body
#            $4 - optional file whose contents become the request body
# Outputs:   the command being run, then the API response, on stdout
# Returns:   curl's exit status (non-zero on HTTP/network failure)
#######################################
grafana_api() {
  local verb=$1
  local url=$2
  local params=$3
  local bodyfile=$4
  # Build curl's argv as an array instead of eval-ing a string: this
  # survives spaces/quotes in the body and removes the injection hazard.
  local -a args=(
    -L -s --fail
    -H "Accept: application/json"
    -H "Content-Type: application/json"
    -X "${verb}"
    -k
  )
  [[ -n "${params}" ]] && args+=(-d "${params}")
  [[ -n "${bodyfile}" ]] && args+=(--data "@${bodyfile}")
  echo -e "Running curl ${args[*]} ${GRAFANA_URL}${url}"
  curl "${args[@]}" "${GRAFANA_URL}${url}" || return 1
  return 0
}
# Block until the Grafana HTTP API responds, polling every 15 seconds.
# A probe against /api/user/preferences doubles as an auth check.
wait_for_api() {
  echo -e "${BLUE}Waiting for Grafana to be available...${NC}"
  until grafana_api GET /api/user/preferences; do
    echo -e "${BLUE}Waiting still...${NC}"
    sleep 15
  done
}
#######################################
# Replace the templated ${DS_INFLUXDB} placeholder in a dashboard JSON
# file with a concrete datasource name, in place.
# Arguments: $1 - dashboard file to edit
#            $2 - datasource name to substitute
# Returns:   0 on success, 1 if sed fails
#######################################
replace_datasource() {
  local dashboard_file=$1
  local datasource_name=$2
  # Call sed directly (no eval): quoting is simpler and arguments with
  # spaces cannot be re-split. -i.bak_remove keeps a backup file, which
  # .gitignore matches via **/*.bak_remove.
  sed -i.bak_remove "s/\${DS_INFLUXDB}/${datasource_name}/g" "${dashboard_file}" || return 1
  return 0
}
#######################################
# Install every dashboards/*.json file into Grafana: detect which
# datasource the dashboard was exported against, substitute a concrete
# datasource name, wrap the JSON in {"dashboard": ...} and POST it.
# Globals:   color constants (read); calls replace_datasource, grafana_api
# Returns:   0; individual dashboard failures are reported but not fatal
#######################################
install_dashboards() {
  local dashboard
  local datasource_name
  for dashboard in dashboards/*.json; do
    # Skip the literal glob when dashboards/ has no .json files.
    [[ -f "${dashboard}" ]] || continue
    # Reset per file: previously a name from an earlier iteration could
    # leak into a dashboard that matched neither pattern.
    datasource_name=""
    if grep -q '"name": "DS_INFLUXDB",' "${dashboard}"; then
      echo -e "${PURPLE}Dashboard ${dashboard} seems to be for InfluxDB datasource${NC}"
      datasource_name="influxdb"
    fi
    # Checked second on purpose: if both markers appear, prometheus wins
    # (same precedence as the original two independent ifs).
    if grep -q '"name": "DS_PROMETHEUS",' "${dashboard}"; then
      echo -e "${PURPLE}Dashboard ${dashboard} seems to be for Prometheus datasource${NC}"
      datasource_name="prometheus"
    fi
    echo -e "${LCYAN}Installing dashboard ${dashboard}${NC}"
    replace_datasource "${dashboard}" "${datasource_name}"
    # A .bak_remove backup was created before wrapping the dashboard ^
    # Wrap the exported JSON as {"dashboard": <json>} — the shape the
    # /api/dashboards/db endpoint expects.
    cp "${dashboard}" "${dashboard}.wrapped"
    sed -i '1s/^/{"dashboard":\n/' "${dashboard}.wrapped"
    echo "}" >> "${dashboard}.wrapped"
    if grafana_api POST /api/dashboards/db "" "${dashboard}.wrapped"; then
      echo -e "\n** ${GREEN}installed ok **${NC}"
    else
      echo -e "\n** ${RED}installation of: ${PURPLE}\"${dashboard}\"${RED} failed **${NC}"
    fi
    #rm ${dashboard}.wrapped
  done
}
# Entry point: wait until the Grafana API is reachable, then push all
# dashboards found under dashboards/.
configure_grafana() {
  wait_for_api
  install_dashboards
}

configure_grafana

View File

@ -0,0 +1,31 @@
#!/bin/bash
# Register the InfluxDB and Prometheus datasources with Grafana via its
# HTTP API. Credentials match the docker-compose defaults.
#
# Errors are deliberately non-fatal (set -e stays commented out): when the
# script is re-run against a Grafana that already has these datasources,
# the POSTs fail and that is acceptable best-effort behavior.
#set -e

# Grafana endpoint; overridable from the environment, default unchanged.
GRAFANA_URL=${GRAFANA_URL:-http://admin:admin@localhost:3000}

# ADD INFLUXDB DATASOURCE
curl -s -v -H "Content-Type: application/json" \
  -XPOST "${GRAFANA_URL}/api/datasources" \
  -d @- <<EOF
{
  "name": "influxdb",
  "type": "influxdb",
  "access": "proxy",
  "url": "http://influxdb:8086",
  "database": "telegraf",
  "user":"telegraf",
  "password":"nimda",
  "basicAuth":false
}
EOF

## ADD PROMETHEUS DATASOURCE
curl -s -v -H "Content-Type: application/json" \
  -XPOST "${GRAFANA_URL}/api/datasources" \
  -d @- <<EOF
{
  "name": "prometheus",
  "type": "prometheus",
  "access": "proxy",
  "url": "http://prometheus:9090"
}
EOF

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

148
telegraf.conf Normal file
View File

@ -0,0 +1,148 @@
[agent]
## Default data collection interval for all inputs
interval = "10s"
## Rounds collection interval to 'interval'
## ie, if interval="10s" then always collect on :00, :10, :20, etc.
round_interval = true
## Telegraf will send metrics to outputs in batches of at
## most metric_batch_size metrics.
metric_batch_size = 1000
## For failed writes, telegraf will cache metric_buffer_limit metrics for each
## output, and will flush this buffer on a successful write. Oldest metrics
## are dropped first when this buffer fills.
metric_buffer_limit = 10000
## Collection jitter is used to jitter the collection by a random amount.
## Each plugin will sleep for a random time within jitter before collecting.
## This can be used to avoid many plugins querying things like sysfs at the
## same time, which can have a measurable effect on the system.
collection_jitter = "0s"
## Default flushing interval for all outputs. You shouldn't set this below
## interval. Maximum flush_interval will be flush_interval + flush_jitter
flush_interval = "10s"
## Jitter the flush interval by a random amount. This is primarily to avoid
## large write spikes for users running a large number of telegraf instances.
## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s
flush_jitter = "0s"
## By default, precision will be set to the same timestamp order as the
## collection interval, with the maximum being 1s.
## Precision will NOT be used for service inputs, such as logparser and statsd.
## Valid values are "ns", "us" (or "µs"), "ms", "s".
precision = ""
## Run telegraf in debug mode
debug = false
## Run telegraf in quiet mode
quiet = false
## Override default hostname, if empty use os.Hostname()
hostname = ""
## If set to true, do not set the "host" tag in the telegraf agent.
omit_hostname = false
###############################################################################
# OUTPUT PLUGINS #
###############################################################################
# Configuration for influxdb server to send metrics to
[[outputs.influxdb]]
## The full HTTP or UDP endpoint URL for your InfluxDB instance.
## Multiple urls can be specified as part of the same cluster,
## this means that only ONE of the urls will be written to each interval.
# urls = ["udp://localhost:8089"] # UDP endpoint example
urls = ["$INFLUXDB_URI"] # required
## The target database for metrics (telegraf will create it if not exists).
database = "telegraf" # required
## Retention policy to write to. Empty string writes to the default rp.
retention_policy = ""
## Write consistency (clusters only), can be: "any", "one", "quorum", "all"
write_consistency = "any"
## Write timeout (for the InfluxDB client), formatted as a string.
## If not provided, will default to 5s. 0s means no timeout (not recommended).
timeout = "5s"
username = "telegraf"
password = "nimda"
## Set the user agent for HTTP POSTs (can be useful for log differentiation)
user_agent = "telegraf-agent"
## Set UDP payload size, defaults to InfluxDB UDP Client default (512 bytes)
# udp_payload = 512
## Optional SSL Config
# ssl_ca = "/etc/telegraf/ca.pem"
# ssl_cert = "/etc/telegraf/cert.pem"
# ssl_key = "/etc/telegraf/key.pem"
## Use SSL but skip chain & host verification
# insecure_skip_verify = false
###############################################################################
# INPUT PLUGINS #
###############################################################################
# Read metrics about cpu usage
[[inputs.cpu]]
## Whether to report per-cpu stats or not
percpu = true
## Whether to report total system cpu stats or not
totalcpu = true
## Comment this line if you want the raw CPU time metrics
fielddrop = ["time_*"]
collect_cpu_time = true
report_active = false
# Read metrics about disk usage by mount point
[[inputs.disk]]
#
#
## By default, telegraf gathers stats for all mountpoints.
## Setting mountpoints will restrict the stats to the specified mountpoints.
# mount_points = ["/"]
## Ignore some mountpoints by filesystem type. For example (dev)tmpfs (usually
## present on /run, /var/run, /dev/shm or /dev).
ignore_fs = ["tmpfs", "devtmpfs"]
[[inputs.net]]
# no configuration
# Read metrics about memory usage
[[inputs.mem]]
# no configuration
# Read metrics about system load & uptime
[[inputs.system]]
# no configuration
[[inputs.processes]]
[[inputs.netstat]]
[[inputs.kernel]]
# Read flattened metrics from one or more JSON HTTP endpoints
#[[inputs.httpjson]]
# a name for the service being polled
#name_override = "monitoring-sebson"
# Your servers
#servers = [
# "http://localhost:4444/status"
#]
#response_timeout = "5s"
## HTTP method to use: GET or POST (case-sensitive)
#method = "GET"
[[inputs.interrupts]]
## To filter which IRQs to collect, make use of tagpass / tagdrop, i.e.
# [inputs.interrupts.tagdrop]
# irq = [ "NET_RX", "TASKLET" ]
# Collects conntrack stats from the configured directories and files.
[[inputs.conntrack]]
## The following defaults would work with multiple versions of conntrack.
## Note the nf_ and ip_ filename prefixes are mutually exclusive across
## kernel versions, as are the directory locations.
## Superset of filenames to look for within the conntrack dirs.
## Missing files will be ignored.
files = ["ip_conntrack_count","ip_conntrack_max",
"nf_conntrack_count","nf_conntrack_max"]
## Directories to search within for the conntrack files above.
## Missing directories will be ignored.
dirs = ["/proc/sys/net/ipv4/netfilter","/proc/sys/net/netfilter"]