main.go 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199
  1. // pinghealth takes a hostfile from the command line and pings each host listed in it until the process is killed.
  2. //
  3. // timeout, wait between pings and retries can be configured with flags.
  4. //
  5. // metrics are exposed to prometheus over http using https://github.com/prometheus/client_golang
  6. //
  7. // ping construction is done by https://github.com/erikh/ping
  8. package main
  9. import (
  10. "bufio"
  11. "flag"
  12. "net"
  13. "net/http"
  14. "os"
  15. "os/signal"
  16. "time"
  17. "github.com/Sirupsen/logrus"
  18. "github.com/cryptix/go/backoff"
  19. "github.com/cryptix/go/logging"
  20. "github.com/erikh/ping"
  21. "github.com/prometheus/client_golang/prometheus"
  22. "gopkg.in/errgo.v1"
  23. )
  24. var (
  25. // flags
  26. listenAddr = flag.String("http", "localhost:1337", "on what http address to listen")
  27. name = flag.String("name", "undefined", "name to give your metrics")
  28. retry = flag.Int("r", 10, "number of retries before aborting")
  29. timeout = flag.Duration("t", 100*time.Millisecond, "what timeout to use for each ping")
  30. wait = flag.Duration("w", 500*time.Millisecond, "how long to wait between ping bursts")
  31. // globals
  32. log = logging.Logger("pingmon")
  33. )
  34. type pinger struct {
  35. ip *net.IPAddr
  36. host string
  37. done chan struct{}
  38. timeouts prometheus.Counter
  39. latency prometheus.Summary
  40. }
  41. func NewPinger(s string) (*pinger, error) {
  42. var (
  43. err error
  44. p pinger
  45. )
  46. if len(s) < 0 {
  47. return nil, errgo.New("host cant be empty")
  48. }
  49. p.host = s
  50. p.ip, err = net.ResolveIPAddr("ip6", s)
  51. if err != nil {
  52. log.Warningf("%15s - ResolveIPAddr(ipv6) failed - %s", p.host, err)
  53. p.ip, err = net.ResolveIPAddr("ip4", s)
  54. if err != nil {
  55. return nil, errgo.Notef(err, "ResolveIPAddr(ipv4) failed - %s", p.host)
  56. }
  57. }
  58. p.timeouts = prometheus.NewCounter(prometheus.CounterOpts{
  59. Name: "pingmon_timeouts",
  60. Help: "number of timeouts occured",
  61. ConstLabels: prometheus.Labels{
  62. "name": *name,
  63. "host": p.host,
  64. },
  65. })
  66. if err = prometheus.Register(p.timeouts); err != nil {
  67. return nil, errgo.Notef(err, "Register of timeouts failed for %s", p.host)
  68. }
  69. p.latency = prometheus.NewSummary(prometheus.SummaryOpts{
  70. Name: "pingmon_latency",
  71. Help: "how big is the latency in avaerage",
  72. ConstLabels: prometheus.Labels{
  73. "name": *name,
  74. "host": p.host,
  75. },
  76. })
  77. if err = prometheus.Register(p.latency); err != nil {
  78. return nil, errgo.Notef(err, "Register of latency failed for %s", p.host)
  79. }
  80. p.done = make(chan struct{})
  81. return &p, nil
  82. }
  83. func (p *pinger) run() {
  84. var (
  85. start time.Time
  86. attempt int
  87. )
  88. start = time.Now()
  89. for {
  90. select {
  91. case <-time.After(backoff.Default.Duration(attempt)):
  92. if attempt > *retry {
  93. log.WithFields(logrus.Fields{
  94. "attempt": attempt,
  95. "host": p.host,
  96. }).Warning("attempts exceeded")
  97. attempt = 0
  98. time.Sleep(1 * time.Minute)
  99. continue
  100. }
  101. err := ping.Pinger(p.ip, *timeout+backoff.Default.Duration(attempt))
  102. if err != nil { // retry
  103. attempt++
  104. log.WithFields(logrus.Fields{
  105. "host": p.host,
  106. "error": err,
  107. "attempt": attempt,
  108. "took": time.Since(start),
  109. }).Info("ping failed")
  110. p.timeouts.Inc()
  111. continue
  112. }
  113. p.latency.Observe(time.Since(start).Seconds())
  114. time.Sleep(*wait)
  115. attempt = 0
  116. start = time.Now() // reset after sucessfull ping - timer updates include timeout duration
  117. case <-p.done: // quit
  118. return
  119. }
  120. }
  121. }
  122. func main() {
  123. flag.Parse()
  124. var (
  125. err error
  126. hostf *os.File
  127. hosts []*pinger
  128. )
  129. if len(flag.Args()) != 1 {
  130. log.Warning("No hostsfile to ping. quiting.")
  131. os.Exit(1)
  132. }
  133. if flag.Args()[0] == "-" {
  134. hostf = os.Stdin
  135. } else {
  136. hostf, err = os.Open(flag.Args()[0])
  137. logging.CheckFatal(err)
  138. defer hostf.Close()
  139. }
  140. shutdown := make(chan os.Signal)
  141. done := make(chan struct{})
  142. signal.Notify(shutdown, os.Interrupt, os.Kill)
  143. go func() {
  144. for sig := range shutdown {
  145. log.Warningf("captured %v, stopping pingers and exiting..", sig)
  146. for _, h := range hosts {
  147. close(h.done)
  148. }
  149. close(done)
  150. }
  151. }()
  152. go func() {
  153. lis, err := net.Listen("tcp", *listenAddr)
  154. logging.CheckFatal(err)
  155. http.Serve(lis, prometheus.Handler())
  156. }()
  157. hostSc := bufio.NewScanner(hostf)
  158. for hostSc.Scan() {
  159. h, err := NewPinger(hostSc.Text())
  160. logging.CheckFatal(err)
  161. hosts = append(hosts, h)
  162. }
  163. logging.CheckFatal(hostSc.Err())
  164. for _, h := range hosts {
  165. go h.run()
  166. }
  167. <-done
  168. }