123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176 |
- // This file is subject to a 1-clause BSD license.
- // Its contents can be found in the enclosed LICENSE file.
- package url
- import (
- "bytes"
- "fmt"
- "html"
- "io"
- "net/http"
- "net/url"
- "regexp"
- "strings"
- "notabug.org/mouz/bot/irc"
- "notabug.org/mouz/bot/irc/proto"
- "notabug.org/mouz/bot/plugins/url/youtube"
- )
// regURL is used by readUrl to extract web page URLs from incoming
// PRIVMSG contents. It matches an http or https scheme in any letter case,
// a dotted host name ending in an alphabetic label, an optional numeric
// port and an optional path made of non-whitespace characters.
var regURL = regexp.MustCompile(`\b[Hh][Tt][Tt][Pp][Ss]?\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]+(\:[0-9]+)?(/\S*)?\b`)

// Markers used to locate the title element while scanning HTML.
var (
	bOpenTitle1 = []byte("<title>")  // opening tag without attributes
	bOpenTitle2 = []byte("<title ")  // opening tag followed by attributes
	bCloseTitle = []byte("</title>") // closing tag
	bCloseTag   = []byte(">")        // end of an opening tag with attributes
)
- // fetchDescription attempts to retrieve some descriptive information
- // for a given URL. This typically is the title of the page or video.
- // For youtube URLs also the duration of the video is retrieved. The
- // resulting description is msg'd back to IRC.
- func fetchDescription(w irc.ResponseWriter, r *irc.Request, url, apiKey string) {
- var descr string
- // For youtube URLs use the API. For other URLs get the html title.
- if id := isYoutube(url); len(id) > 0 {
- videoInfo, err := youtube.GetVideoInfo(apiKey, id)
- if err != nil {
- return
- }
- descr = fmt.Sprintf("%s", videoInfo.Title)
- descr += fmt.Sprintf(TextYoutubeDuration, videoInfo.Duration)
- } else {
- descr = fetchTitle(url)
- }
- if len(descr) == 0 {
- return
- }
- // If the description exactly matches one of the titles that we
- // want to ignore, do not show it.
- if Ignore[descr] {
- return
- }
- // Show the title to the channel from whence the URL came.
- _ = proto.PrivMsg(w, r.Target, TextDisplay, r.SenderName, descr)
- }
- // fetchTitle retreives and returns the title from the html header
- // for non-youtube URLs. On error it returns the empty string.
- func fetchTitle(url string) string {
- // Ensure the url targets a HTML page. We do this by issueing a HEAD
- // request and checking its content type header.
- resp, err := http.Head(url)
- if err != nil {
- return ""
- }
- _ = resp.Body.Close()
- ctype := strings.ToLower(resp.Header.Get("Content-Type"))
- if strings.Index(ctype, "text/html") == -1 &&
- strings.Index(ctype, "text/xhtml") == -1 {
- return ""
- }
- // We have an HTML document -- Fetch its contents.
- resp, err = http.Get(url)
- if err != nil {
- return ""
- }
- // buf defines the maximum amount of data we will be reading from a page,
- // before stopping our search for the <title> tag.
- //
- // 16kB is a chunky buffer, but some sites packa a ludicrous amount of
- // crud in their page headers, before getting to the <title> tag.
- var buf [1024 * 16]byte
- // Read the body.
- n, err := io.ReadFull(resp.Body, buf[:])
- _ = resp.Body.Close()
- if err != nil && n <= 0 {
- return "" // Exit only if no data was read at all.
- }
- body := buf[:n]
- // Extract the title.
- s := bytes.Index(bytes.ToLower(body), bOpenTitle1)
- if s == -1 {
- // title could be something like:
- //
- // <title xml:lang="en-US">....</title>
- //
- s = bytes.Index(bytes.ToLower(body), bOpenTitle2)
- if s == -1 {
- return ""
- }
- body = body[s+len(bOpenTitle2):]
- s = bytes.Index(body, bCloseTag)
- if s == -1 {
- return ""
- }
- body = body[s+1:]
- } else {
- body = body[s+len(bOpenTitle1):]
- }
- e := bytes.Index(bytes.ToLower(body), bCloseTitle)
- if e == -1 {
- e = len(body) - 1
- }
- body = bytes.TrimSpace(body[:e])
- if len(body) == 0 {
- return ""
- }
- return html.UnescapeString(string(body))
- }
// isYoutube returns the video ID if v denotes a recognized youtube video
// URL, and the empty string otherwise (including when v cannot be parsed).
//
// Two URL shapes are recognized, with the host compared case-insensitively
// and any port ignored:
//
//   - youtube.com, www.youtube.com and m.youtube.com, where the ID is the
//     value of the "v" query parameter;
//   - youtu.be, www.youtu.be and m.youtu.be, where the ID is the URL path.
func isYoutube(v string) string {
	u, err := url.Parse(v)
	if err != nil {
		return ""
	}

	switch strings.ToLower(u.Hostname()) {
	case "youtube.com", "www.youtube.com", "m.youtube.com":
		return strings.TrimSpace(u.Query().Get("v"))

	case "youtu.be", "www.youtu.be", "m.youtu.be":
		// Use the path only: the request URI would also carry the query
		// string (e.g. "?t=42"), which is not part of the video ID.
		return strings.Trim(u.Path, "/")
	}

	return ""
}
|