http.go 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176
  1. // This file is subject to a 1-clause BSD license.
  2. // Its contents can be found in the enclosed LICENSE file.
  3. package url
  4. import (
  5. "bytes"
  6. "fmt"
  7. "html"
  8. "io"
  9. "net/http"
  10. "net/url"
  11. "regexp"
  12. "strings"
  13. "notabug.org/mouz/bot/irc"
  14. "notabug.org/mouz/bot/irc/proto"
  15. "notabug.org/mouz/bot/plugins/url/youtube"
  16. )
  17. var (
  18. // regURL is used by readUrl to extract web page URLs from incoming
  19. // PRIVMSG contents.
  20. regURL = regexp.MustCompile(`\b[Hh][Tt][Tt][Pp][Ss]?\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]+(\:[0-9]+)?(/\S*)?\b`)
  21. // These values are used to extract title contents from HTML.
  22. bOpenTitle1 = []byte("<title>")
  23. bOpenTitle2 = []byte("<title ")
  24. bCloseTitle = []byte("</title>")
  25. bCloseTag = []byte(">")
  26. )
  27. // fetchDescription attempts to retrieve some descriptive information
  28. // for a given URL. This typically is the title of the page or video.
  29. // For youtube URLs also the duration of the video is retrieved. The
  30. // resulting description is msg'd back to IRC.
  31. func fetchDescription(w irc.ResponseWriter, r *irc.Request, url, apiKey string) {
  32. var descr string
  33. // For youtube URLs use the API. For other URLs get the html title.
  34. if id := isYoutube(url); len(id) > 0 {
  35. videoInfo, err := youtube.GetVideoInfo(apiKey, id)
  36. if err != nil {
  37. return
  38. }
  39. descr = fmt.Sprintf("%s", videoInfo.Title)
  40. descr += fmt.Sprintf(TextYoutubeDuration, videoInfo.Duration)
  41. } else {
  42. descr = fetchTitle(url)
  43. }
  44. if len(descr) == 0 {
  45. return
  46. }
  47. // If the description exactly matches one of the titles that we
  48. // want to ignore, do not show it.
  49. if Ignore[descr] {
  50. return
  51. }
  52. // Show the title to the channel from whence the URL came.
  53. _ = proto.PrivMsg(w, r.Target, TextDisplay, r.SenderName, descr)
  54. }
  55. // fetchTitle retreives and returns the title from the html header
  56. // for non-youtube URLs. On error it returns the empty string.
  57. func fetchTitle(url string) string {
  58. // Ensure the url targets a HTML page. We do this by issueing a HEAD
  59. // request and checking its content type header.
  60. resp, err := http.Head(url)
  61. if err != nil {
  62. return ""
  63. }
  64. _ = resp.Body.Close()
  65. ctype := strings.ToLower(resp.Header.Get("Content-Type"))
  66. if strings.Index(ctype, "text/html") == -1 &&
  67. strings.Index(ctype, "text/xhtml") == -1 {
  68. return ""
  69. }
  70. // We have an HTML document -- Fetch its contents.
  71. resp, err = http.Get(url)
  72. if err != nil {
  73. return ""
  74. }
  75. // buf defines the maximum amount of data we will be reading from a page,
  76. // before stopping our search for the <title> tag.
  77. //
  78. // 16kB is a chunky buffer, but some sites packa a ludicrous amount of
  79. // crud in their page headers, before getting to the <title> tag.
  80. var buf [1024 * 16]byte
  81. // Read the body.
  82. n, err := io.ReadFull(resp.Body, buf[:])
  83. _ = resp.Body.Close()
  84. if err != nil && n <= 0 {
  85. return "" // Exit only if no data was read at all.
  86. }
  87. body := buf[:n]
  88. // Extract the title.
  89. s := bytes.Index(bytes.ToLower(body), bOpenTitle1)
  90. if s == -1 {
  91. // title could be something like:
  92. //
  93. // <title xml:lang="en-US">....</title>
  94. //
  95. s = bytes.Index(bytes.ToLower(body), bOpenTitle2)
  96. if s == -1 {
  97. return ""
  98. }
  99. body = body[s+len(bOpenTitle2):]
  100. s = bytes.Index(body, bCloseTag)
  101. if s == -1 {
  102. return ""
  103. }
  104. body = body[s+1:]
  105. } else {
  106. body = body[s+len(bOpenTitle1):]
  107. }
  108. e := bytes.Index(bytes.ToLower(body), bCloseTitle)
  109. if e == -1 {
  110. e = len(body) - 1
  111. }
  112. body = bytes.TrimSpace(body[:e])
  113. if len(body) == 0 {
  114. return ""
  115. }
  116. return html.UnescapeString(string(body))
  117. }
  118. // isYoutube returns a video ID and true if v denotes a recognized youtube
  119. // video URL. Returns an empty string otherwise.
  120. func isYoutube(v string) string {
  121. u, err := url.Parse(v)
  122. if err != nil {
  123. return ""
  124. }
  125. if strings.EqualFold(u.Host, "youtube.com") ||
  126. strings.EqualFold(u.Host, "www.youtube.com") ||
  127. strings.EqualFold(u.Host, "m.youtube.com") {
  128. id := strings.TrimSpace(u.Query().Get("v"))
  129. return id
  130. }
  131. if strings.EqualFold(u.Host, "youtu.be") ||
  132. strings.EqualFold(u.Host, "www.youtu.be") ||
  133. strings.EqualFold(u.Host, "m.youtu.be") {
  134. id := u.RequestURI()
  135. if strings.HasPrefix(id, "/") {
  136. id = id[1:]
  137. }
  138. return id
  139. }
  140. return ""
  141. }