adding some base for monitoring, before adding prometheus and the rest

2018-06-09 10:29:15 +02:00
commit 9487fb10aa
8 changed files with 10657 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,73 @@
 # swp vim edits
 *.swp
 *.swo
 .os_release.sh
 .configuration.cluster  
 .configuration.installs
 .awscli-bundle/
 # Python
 *.pyc
 **/.cache
 **/.coverage
 # Eclipse related files
 .classpath
 .cproject
 .deps
 .project
 .settings
 .metadata/
 # MAC DS files
 .DS_Store
 # STS
 .springBeans
 # Maven and build related
 target/
 packages.mk
 # IntelliJ IDEA related files
 *.iml
 *.iws
 *.ipr
 .idea/
 .sonar-ide.properties
 *.orig
 *rebel*.xml
 .idea/
 /INSTALL
 /Makefile
 /aclocal.m4
 /autom4te.cache/
 /build/aux/
 /config.log
 /config.status
 /configure
 # sed related files
 **/*.bak_remove
 # terraform state files
 **/.terraform
 **/*.tfstate
 **/*.tfstate.backup
 **/.terraform.tfstate.lock.info
 # Folders and files
 certificates
 terraform
 configuration.cluster
 configuration.installs
 other
 ec2
 monitoring.backup
 monitoring/monitoring_*_dir
 **/*.bak_remove
 **/*.wrapped
 data
--- a/README.md
+++ b/README.md
@@ -0,0 +1 @@
 # monitoring-grafana-influxdb-telegraf-prometheus
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -0,0 +1,54 @@
 # Compose file for the monitoring stack: InfluxDB, Grafana and Telegraf.
 version: '3'
 networks:
  # Network shared by the influxdb and grafana services.
  public: {}
 volumes:
  # Named volumes mounted by the grafana service below.
  grafana_lib: {}
  grafana_ds: 
 services:
  influxdb:
    image: influxdb:alpine
    container_name: influxdb
    ports:
      - "8086:8086"
    networks:
      - public
    volumes:
      # Database files are kept on the host under ./data/influxdb.
      - ./data/influxdb:/var/lib/influxdb
    environment:
      INFLUXDB_REPORTING_DISABLED: "true"
      # The same database name, user and password are repeated in the
      # grafana service below.
      INFLUXDB_DB: telegraf
      INFLUXDB_USER: telegraf
      INFLUXDB_USER_PASSWORD: nimda
  grafana:
      image: grafana/grafana:5.1.3
      container_name: grafana
      ports:
        - "3000:3000"
      networks:
        - public
      volumes:
        - grafana_lib:/var/lib/grafana
        - grafana_ds:/var/lib/grafana/ds:rw
        # The datasource script from this repository is mounted into the
        # grafana_ds volume path.
        - ${PWD}/grafana/add_datasources.sh:/var/lib/grafana/ds/add_datasources.sh
      environment:
        # Anonymous access is enabled and anonymous users get the Admin role.
        GF_AUTH_ANONYMOUS_ENABLED: "true"
        GF_AUTH_ANONYMOUS_ORG_ROLE: "Admin"
        # InfluxDB is addressed by its service name on the shared network.
        INFLUXDB_URI: "http://influxdb:8086"
        INFLUXDB_DB: telegraf
        INFLUXDB_USER: telegraf
        INFLUXDB_USER_PASSWORD: nimda
      # NOTE(review): this replaces the image's default command with the
      # datasource script - confirm the Grafana server itself still starts.
      command: ["bash", "/var/lib/grafana/ds/add_datasources.sh"]
  telegraf:
    image: telegraf:latest
    container_name: telegraf
    # Runs on the host network, so InfluxDB is reached through the port
    # published on localhost rather than through the "public" network.
    network_mode: "host"
    volumes:
      # Configuration is mounted read-only.
      - ./telegraf.conf:/etc/telegraf/telegraf.conf:ro
    environment:
      # real influx host
      INFLUXDB_URI: "http://localhost:8086"
--- a/grafana/add_dashboards.sh
+++ b/grafana/add_dashboards.sh
@@ -0,0 +1,97 @@
 #!/bin/bash
 #
 # Installs the Grafana dashboards found in dashboards/*.json through the
 # Grafana HTTP API, after waiting for the API to answer.
 #
 # Environment:
 #   GRAFANA_URL  Base URL of the Grafana API, credentials included.
 #                Defaults to http://admin:admin@localhost:3000.
 set -e
 # ANSI colour escape sequences; they are rendered by `echo -e`.
 NC='\033[0m'
 RED='\033[00;31m'
 GREEN='\033[00;32m'
 YELLOW='\033[00;33m'
 BLUE='\033[00;34m'
 PURPLE='\033[00;35m'
 CYAN='\033[00;36m'
 LIGHTGRAY='\033[00;37m'
 MAGENTA='\033[00;35m'
 LRED='\033[01;31m'
 LGREEN='\033[01;32m'
 LYELLOW='\033[01;33m'
 LBLUE='\033[01;34m'
 LPURPLE='\033[01;35m'
 LCYAN='\033[01;36m'
 WHITE='\033[01;37m'
 # Honour a GRAFANA_URL supplied by the caller so the script can target a
 # Grafana other than the local default without being edited.
 GRAFANA_URL=${GRAFANA_URL:-http://admin:admin@localhost:3000}
 #######################################
 # Issue one HTTP request against the Grafana API using curl.
 # Globals:   GRAFANA_URL (read) - base URL prepended to the request path
 # Arguments: $1 - HTTP verb (GET, POST, ...)
 #            $2 - request path, e.g. /api/dashboards/db
 #            $3 - optional inline request body (sent with -d)
 #            $4 - optional file whose contents are sent as the body
 # Outputs:   the command line being run, followed by curl's output, on
 #            stdout. The logged line contains GRAFANA_URL, credentials
 #            included.
 # Returns:   0 if curl succeeded, 1 otherwise (curl runs with --fail)
 #######################################
 grafana_api() {
   local verb=$1
   local url=$2
   local params=${3:-}
   local bodyfile=${4:-}
   local -a cmd
   # Build the argument vector as an array instead of a string passed to
   # eval: every value reaches curl as exactly one argument, whatever
   # quotes, spaces or shell metacharacters it contains.
   cmd=(curl -L -s --fail
     -H "Accept: application/json"
     -H "Content-Type: application/json"
     -X "${verb}" -k "${GRAFANA_URL}${url}")
   if [[ -n "${params}" ]]; then
     cmd+=(-d "${params}")
   fi
   if [[ -n "${bodyfile}" ]]; then
     cmd+=(--data "@${bodyfile}")
   fi
   printf 'Running %s\n' "${cmd[*]}"
   "${cmd[@]}" || return 1
   return 0
 }
 # Poll the Grafana API until a GET on /api/user/preferences succeeds,
 # pausing 15 seconds after every failed attempt. There is no upper bound
 # on the number of attempts.
 wait_for_api() {
   echo -e "${BLUE}Waiting for Grafana to be available...${NC}"
   until grafana_api GET /api/user/preferences; do
     echo -e "${BLUE}Waiting still...${NC}"
     sleep 15
   done
 }
 #######################################
 # Replace a "${DS_...}" datasource placeholder in a dashboard file, in
 # place, with the name of a datasource. sed keeps the previous contents
 # in "<file>.bak_remove".
 # Arguments: $1 - dashboard file to edit
 #            $2 - datasource name to insert
 #            $3 - optional placeholder name without "${" and "}";
 #                 defaults to DS_ followed by $2 in upper case, i.e.
 #                 influxdb -> DS_INFLUXDB, prometheus -> DS_PROMETHEUS
 # Returns:   0 on success, 1 if sed failed
 #######################################
 replace_datasource() {
   local dashboard_file=$1
   local datasource_name=$2
   local placeholder=${3:-}
   local replacement
   if [[ -z "${placeholder}" ]]; then
     # Derive the placeholder from the datasource so that a Prometheus
     # dashboard gets ${DS_PROMETHEUS} replaced, not ${DS_INFLUXDB}.
     placeholder="DS_$(printf '%s' "${datasource_name}" | tr '[:lower:]' '[:upper:]')"
   fi
   # Escape the characters that are special on the right-hand side of s///.
   replacement=$(printf '%s' "${datasource_name}" | sed 's|[/&\\]|\\&|g') || return 1
   # "$" is only an anchor at the end of a basic regular expression and
   # braces are literal, so this matches the text ${PLACEHOLDER} verbatim.
   sed -i.bak_remove "s/\${${placeholder}}/${replacement}/g" "${dashboard_file}" || return 1
   return 0
 }
 #######################################
 # Install every dashboard matching dashboards/*.json into Grafana.
 # For each file: detect the datasource it was exported for, substitute
 # the datasource placeholder, wrap the JSON as {"dashboard": ...} in
 # "<file>.wrapped" and POST that file to /api/dashboards/db.
 # A failed upload is reported and the loop moves on to the next file.
 # Globals:   PURPLE, LCYAN, GREEN, RED, NC (read) - colour sequences
 # Arguments: none
 # Returns:   1 if a dashboard could not be prepared, 0 otherwise
 #######################################
 install_dashboards() {
   local dashboard
   local datasource_name
   local wrapped
   for dashboard in dashboards/*.json; do
     # An unmatched glob stays as the literal pattern; skip anything that
     # is not a regular file before touching it.
     [[ -f "${dashboard}" ]] || continue
     # Reset per file so a dashboard without a known marker does not
     # inherit the datasource detected for the previous one.
     datasource_name=""
     if grep -q '"name": "DS_INFLUXDB",' -- "${dashboard}"; then
       echo -e "${PURPLE}Dashboard ${dashboard} seems to be for InfluxDB datasource${NC}"
       datasource_name="influxdb"
     fi
     if grep -q '"name": "DS_PROMETHEUS",' -- "${dashboard}"; then
       echo -e "${PURPLE}Dashboard ${dashboard} seems to be for Prometheus datasource${NC}"
       datasource_name="prometheus"
     fi
     echo -e "${LCYAN}Installing dashboard ${dashboard}${NC}"
     if [[ -n "${datasource_name}" ]]; then
       # backup will be created before wrapping dashboard
       replace_datasource "${dashboard}" "${datasource_name}" || return 1
     fi
     wrapped="${dashboard}.wrapped"
     # Written with printf/cat instead of `sed -i '1s/^/...\n/'`, whose
     # in-place flag and "\n" handling are specific to GNU sed.
     {
       printf '{"dashboard":\n'
       cat -- "${dashboard}"
       printf '}\n'
     } > "${wrapped}" || return 1
     if grafana_api POST /api/dashboards/db "" "${wrapped}"; then
       echo -e "\n** ${GREEN}installed ok **${NC}"
     else
       echo -e "\n** ${RED}installation of: ${PURPLE}\"${dashboard}\"${RED} failed **${NC}"
     fi
     #rm "${wrapped}"
   done
 }
 # Entry point: wait until the Grafana API answers, then install the
 # dashboards. Takes no arguments.
 configure_grafana() {
  wait_for_api
  install_dashboards
 }
 # Run the whole procedure when the script is executed.
 configure_grafana
--- a/grafana/add_datasources.sh
+++ b/grafana/add_datasources.sh
@@ -0,0 +1,31 @@
 #!/bin/bash
 #set -e
 # ADD INFLUXDB DATASOURCE
 curl -s -v -H "Content-Type: application/json" \
    -XPOST http://admin:admin@localhost:3000/api/datasources \
    -d @- <<EOF
 {
    "name": "influxdb",
    "type": "influxdb",
    "access": "proxy",
    "url": "http://influxdb:8086",
    "database": "telegraf",
    "user":"telegraf",
    "password":"nimda",
    "basicAuth":false
 }
 EOF
 ## ADD PROMETHEUS DATASOURCE
 curl -s -v -H "Content-Type: application/json" \
    -XPOST http://admin:admin@localhost:3000/api/datasources \
    -d @- <<EOF
 {
    "name": "prometheus",
    "type": "prometheus",
    "access": "proxy",
    "url": "http://prometheus:9090"
 }
 EOF
--- a/grafana/dashboards/prometheus-system.json
+++ b/grafana/dashboards/prometheus-system.json
--- a/grafana/dashboards/telegraf-dashboard.json
+++ b/grafana/dashboards/telegraf-dashboard.json
--- a/telegraf.conf
+++ b/telegraf.conf
@@ -0,0 +1,148 @@
 [agent]
 ## Default data collection interval for all inputs
 interval = "10s"
 ## Rounds collection interval to 'interval'
 ## ie, if interval="10s" then always collect on :00, :10, :20, etc.
 round_interval = true
 ## Telegraf will send metrics to outputs in batches of at
 ## most metric_batch_size metrics.
 metric_batch_size = 1000
 ## For failed writes, telegraf will cache metric_buffer_limit metrics for each
 ## output, and will flush this buffer on a successful write. Oldest metrics
 ## are dropped first when this buffer fills.
 metric_buffer_limit = 10000
 ## Collection jitter is used to jitter the collection by a random amount.
 ## Each plugin will sleep for a random time within jitter before collecting.
 ## This can be used to avoid many plugins querying things like sysfs at the
 ## same time, which can have a measurable effect on the system.
 collection_jitter = "0s"
 ## Default flushing interval for all outputs. You shouldn't set this below
 ## interval. Maximum flush_interval will be flush_interval + flush_jitter
 flush_interval = "10s"
 ## Jitter the flush interval by a random amount. This is primarily to avoid
 ## large write spikes for users running a large number of telegraf instances.
 ## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s
 flush_jitter = "0s"
 ## By default, precision will be set to the same timestamp order as the
 ## collection interval, with the maximum being 1s.
 ## Precision will NOT be used for service inputs, such as logparser and statsd.
 ## Valid values are "ns", "us" (or "µs"), "ms", "s".
 precision = ""
 ## Run telegraf in debug mode
 debug = false
 ## Run telegraf in quiet mode
 quiet = false
 ## Override default hostname, if empty use os.Hostname()
 hostname = ""
 ## If set to true, do not set the "host" tag in the telegraf agent.
 omit_hostname = false
 ###############################################################################
 #                            OUTPUT PLUGINS                                   #
 ###############################################################################
 # Configuration for influxdb server to send metrics to
 [[outputs.influxdb]]
 ## The full HTTP or UDP endpoint URL for your InfluxDB instance.
 ## Multiple urls can be specified as part of the same cluster,
 ## this means that only ONE of the urls will be written to each interval.
 # urls = ["udp://localhost:8089"] # UDP endpoint example
 urls = ["$INFLUXDB_URI"] # required
 ## The target database for metrics (telegraf will create it if not exists).
 database = "telegraf" # required
 ## Retention policy to write to. Empty string writes to the default rp.
 retention_policy = ""
 ## Write consistency (clusters only), can be: "any", "one", "quorum", "all"
 write_consistency = "any"
 ## Write timeout (for the InfluxDB client), formatted as a string.
 ## If not provided, will default to 5s. 0s means no timeout (not recommended).
 timeout = "5s"
 username = "telegraf"
 password = "nimda"
 ## Set the user agent for HTTP POSTs (can be useful for log differentiation)
 user_agent = "telegraf-agent"
 ## Set UDP payload size, defaults to InfluxDB UDP Client default (512 bytes)
 # udp_payload = 512
 ## Optional SSL Config
 # ssl_ca = "/etc/telegraf/ca.pem"
 # ssl_cert = "/etc/telegraf/cert.pem"
 # ssl_key = "/etc/telegraf/key.pem"
 ## Use SSL but skip chain & host verification
 # insecure_skip_verify = false
 ###############################################################################
 #                            INPUT PLUGINS                                    #
 ###############################################################################
 # Read metrics about cpu usage
 [[inputs.cpu]]
 ## Whether to report per-cpu stats or not
 percpu = true
 ## Whether to report total system cpu stats or not
 totalcpu = true
 ## Comment this line if you want the raw CPU time metrics
 fielddrop = ["time_*"]
 collect_cpu_time = true
 report_active = false
 # Read metrics about disk usage by mount point
 [[inputs.disk]]
 #
 #
 ## By default, telegraf gathers stats for all mountpoints.
 ## Setting mountpoints will restrict the stats to the specified mountpoints.
 # mount_points = ["/"]
 ## Ignore some mountpoints by filesystem type. For example (dev)tmpfs (usually
 ## present on /run, /var/run, /dev/shm or /dev).
 ignore_fs = ["tmpfs", "devtmpfs"]
 [[inputs.net]]
 # no configuration
 # Read metrics about memory usage
 [[inputs.mem]]
 # no configuration
 # Read metrics about system load & uptime
 [[inputs.system]]
 # no configuration
 [[inputs.processes]]
 [[inputs.netstat]]
 [[inputs.kernel]]
 # Read flattened metrics from one or more JSON HTTP endpoints
 #[[inputs.httpjson]]
 # a name for the service being polled
 #name_override = "monitoring-sebson"
 # Your servers
 #servers = [
 #  "http://localhost:4444/status"
 #]
 #response_timeout = "5s"
 ## HTTP method to use: GET or POST (case-sensitive)
 #method = "GET"
 [[inputs.interrupts]]
  ## To filter which IRQs to collect, make use of tagpass / tagdrop, i.e.
  # [inputs.interrupts.tagdrop]
    # irq = [ "NET_RX", "TASKLET" ]
 # Collects conntrack stats from the configured directories and files.
 [[inputs.conntrack]]
  ## The following defaults would work with multiple versions of conntrack.
  ## Note the nf_ and ip_ filename prefixes are mutually exclusive across
  ## kernel versions, as are the directory locations.
  ## Superset of filenames to look for within the conntrack dirs.
  ## Missing files will be ignored.
  files = ["ip_conntrack_count","ip_conntrack_max",
           "nf_conntrack_count","nf_conntrack_max"]
  ## Directories to search within for the conntrack files above.
  ## Missing directories will be ignored.
  dirs = ["/proc/sys/net/ipv4/netfilter","/proc/sys/net/netfilter"]
		`@@ -0,0 +1 @@`
							`# monitoring-grafana-influxdb-telegraf-prometheus`