123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153 |
- // This file is subject to a 1-clause BSD license.
- // Its contents can be found in the enclosed LICENSE file.
- package url
- import (
- "bytes"
- "fmt"
- "html"
- "io"
- "net/http"
- "net/url"
- "regexp"
- "strings"
- "notabug.org/mouz/bot/irc"
- "notabug.org/mouz/bot/irc/proto"
- "notabug.org/mouz/bot/plugins/url/youtube"
- )
- var (
- // regUrl is used by readUrl to extract web page URLs from incoming
- // PRIVMSG contents.
- regUrl = regexp.MustCompile(`\b[Hh][Tt][Tt][Pp][Ss]?\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]+(\:[0-9]+)?(/\S*)?\b`)
- // These values are used to extract title contents from HTML.
- bOpenTitle1 = []byte("<title>")
- bOpenTitle2 = []byte("<title ")
- bCloseTitle = []byte("</title>")
- bCloseTag = []byte(">")
- )
- // fetchTitle attempts to retrieve the title element for a given url.
- func fetchTitle(w irc.ResponseWriter, r *irc.Request, url, apiKey string) {
- // Ensure the url targets a HTML page. We do this by issueing a HEAD
- // request and checking its content type header.
- resp, err := http.Head(url)
- if err != nil {
- return
- }
- resp.Body.Close()
- ctype := strings.ToLower(resp.Header.Get("Content-Type"))
- if strings.Index(ctype, "text/html") == -1 &&
- strings.Index(ctype, "text/xhtml") == -1 {
- return
- }
- // We have an HTML document -- Fetch its contents.
- resp, err = http.Get(url)
- if err != nil {
- return
- }
- // buf defines the maximum amount of data we will be reading from a page,
- // before stopping our search for the <title> tag.
- //
- // 16kB is a chunky buffer, but some sites packa a ludicrous amount of
- // crud in their page headers, before getting to the <title> tag.
- var buf [1024 * 16]byte
- // Read the body.
- n, err := io.ReadFull(resp.Body, buf[:])
- resp.Body.Close()
- if err != nil && n <= 0 {
- return // Exit only if no data was read at all.
- }
- body := buf[:n]
- // Extract the title.
- s := bytes.Index(bytes.ToLower(body), bOpenTitle1)
- if s == -1 {
- // title could be something like:
- //
- // <title xml:lang="en-US">....</title>
- //
- s = bytes.Index(bytes.ToLower(body), bOpenTitle2)
- if s == -1 {
- return
- }
- body = body[s+len(bOpenTitle2):]
- s = bytes.Index(body, bCloseTag)
- if s == -1 {
- return
- }
- body = body[s+1:]
- } else {
- body = body[s+len(bOpenTitle1):]
- }
- e := bytes.Index(bytes.ToLower(body), bCloseTitle)
- if e == -1 {
- e = len(body) - 1
- }
- body = bytes.TrimSpace(body[:e])
- if len(body) == 0 {
- return
- }
- title := html.UnescapeString(string(body))
- // If we are dealing with a youtube link, try to fetch the
- // video duration and append it to our response.
- if id := isYoutube(url); len(id) > 0 {
- info, err := youtube.GetVideoInfo(apiKey, id)
- if err == nil {
- title += fmt.Sprintf(TextYoutubeDuration, info.Duration)
- }
- }
- // If the title matches one of the titles that we want to ignore,
- // do not show it.
- if Ignore[title] {
- return
- }
- // Show the title to the channel from whence the URL came.
- proto.PrivMsg(w, r.Target, TextDisplay, r.SenderName, title)
- }
- // isYoutube returns a video ID and true if v denotes a recognized youtube
- // video URL. Returns an empty string otherwise.
- func isYoutube(v string) string {
- u, err := url.Parse(v)
- if err != nil {
- return ""
- }
- if strings.EqualFold(u.Host, "youtube.com") ||
- strings.EqualFold(u.Host, "www.youtube.com") {
- id := strings.TrimSpace(u.Query().Get("v"))
- return id
- }
- if strings.EqualFold(u.Host, "youtu.be") ||
- strings.EqualFold(u.Host, "www.youtu.be") {
- id := u.RequestURI()
- if strings.HasPrefix(id, "/") {
- id = id[1:]
- }
- return id
- }
- return ""
- }
|