#!/bin/sh
#SnooScraper - scrape reddit posts fitting certain criteria, and commonly linked image sites
#
# Copyright 2018 Oliver Galvin <odg@riseup.net>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

#Error handling
depends="sh curl jq sed grep find xargs file du b2sum md5sum ping"
for c in $depends; do if ! command -v "$c" >/dev/null 2>&1; then echo "Error: dependency ${c} is missing. Aborting."; exit 1; fi; done
if [ $# -gt 2 ]; then echo "Error: too many options. Aborting."; exit 1; fi
if [ $# = 1 ] && [ ! "$1" = "-h" ] && [ ! "$1" = "--help" ]; then echo "Error: missing option. Aborting."; exit 1; fi
trap 'echo "Interrupted, finishing early."; exit 1' INT

#Default configuration values
target=25; limit=100
post=0; nonimage=0; gif=0; album=1
insta=1; pinterest=1; tumblr=1

#Global variables
progname=$(basename "$0")
name=$(date +%s); tally=0
url=$2; url=${url#*//}; url=${url#m.}

echo "${progname} - v0.2
---"
#Get the configuration if it exists
if [ -e "$(pwd)/config" ]; then config="$(pwd)/config"
elif [ -e "${XDG_CONFIG_HOME}/snooscraper/config" ]; then config="${XDG_CONFIG_HOME}/snooscraper/config"
elif [ -e "${HOME}/.snooscraper" ]; then config="${HOME}/.snooscraper"; fi
if [ -e "$config" ]; then
  while read -r line; do
    line=$(echo "${line%%\#*}" | tr -d '[:space:]')
    name=${line%%=*}; value=${line##*=}
    [ -z "$line" ] && continue
    case $name in
      target|limit) eval "$name=$value" ;;
      post|nonimage|gif|album)
        case $value in
          only) if [ "$only" ]; then echo "Error: only one option can be set to 'only'. Aborting."; exit 1
            else eval "$name=1"; only="$name"
          fi ;;
          include) eval "$name=1" ;;
          off) eval "$name=0" ;;
          *) echo "Error: invalid value in config file. Aborting."; exit 1 ;;
        esac ;;
      insta|tumblr|pinterest)
        case $value in
          on) eval "$name=1" ;;
          off) eval "$name=0" ;;
          *) echo "Error: invalid value in config file. Aborting."; exit 1 ;;
        esac ;;
      *) echo "Error: invalid config file. Aborting."; exit 1 ;;
    esac
  done < "$config"
elif [ ! "$1" = "-h" ] && [ ! "$1" = "--help" ] && [ ! $# = 0 ]; then echo "Warning: no config file found, using default parameters."; fi
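
#Example config (illustrative - the format is inferred from the parser above):
#  target=50        #number of matching posts to collect
#  limit=500        #maximum number of posts to search
#  gif=include      #post/nonimage/gif/album take: only, include, or off
#  insta=off        #insta/tumblr/pinterest take: on or off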

clean () {
  dir="$1"
  if [ -z "${dir}" ] || [ ! -d "${dir}" ]; then echo "Error: dump ${dir} does not exist. Aborting."; exit 1; fi
  echo "Tidying dump in ${dir} to remove failed downloads or duplicates/reposts."
  rm -f "${dir}/temp" "${dir}/found"
  printf "Removing empty files... "
  c=$(find "${dir}" -type f | wc -l)
  du -a "$dir" | grep '^0' | cut -f2 | xargs rm -f
  d=$(find "${dir}" -type f | wc -l)
  echo "done, $(( c-d )) files removed"
  printf "Removing corrupt/invalid files... "
  if [ "$post" = 0 ] && [ "$nonimage" = 0 ] || [ "$gif" = 0 ]; then
    data=$(file -i "$dir"/*)
    if [ "$post" = 0 ] && [ "$nonimage" = 0 ]; then echo "$data" | grep 'text/' | cut -d':' -f1 | xargs rm -f; fi
    if [ "$gif" = 0 ]; then echo "$data" | grep '/gif' | cut -d':' -f1 | xargs rm -f; fi
  fi
  if [ "$album" = 0 ]; then find "$dir" -name '*-*' -exec rm -f {} + ; fi
  c=$(find "${dir}" -type f | wc -l)
  echo "done, $(( d-c )) files removed"
  printf "Removing duplicates... "
  old=""
  b2sum "$dir"/* | sort | uniq -Dw 128 | while IFS= read -r line; do
    new=${line%% *}
    #b2sum separates the digest and filename with two spaces, so strip both
    file=${line#* }; file=${file# }
    if [ "$new" = "$old" ]; then rm -f "$file"; fi
    old="$new"
  done
  d=$(find "${dir}" -type f | wc -l)
  echo "done, $(( c-d )) files removed"
  echo
  echo "${d} files remaining."
}
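
#Usage example ("pics" is a placeholder dump directory): ./snooscraper -c pics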

getinstapost () {
  #Downloads image from an instagram post, a video if gifs are enabled, multiple images if albums are enabled
  url="$1"; term="sharedData = "
  url="https://www.instagram.com$(echo "$url" | grep -Po '/p/[[:alnum:]]*')/"
  data=$(curl -sk --retry 1 "${url}" | grep -Po "${term}{.*}" | sed "s/$term//g")
  [ -z "$data" ] && return
  type=$(echo "$data" | jq -r '.entry_data.PostPage[0].graphql.shortcode_media.__typename')
  case ${type} in
    GraphImage) #Static image
      : $(( tally=tally+1 ))
      url=$(echo "$data" | jq -r '.entry_data.PostPage[0].graphql.shortcode_media.display_resources[2].src')
      ext=${url##*.}
      curl -sk --retry 1 "$url" -o "${name}.${ext}" & ;;
    GraphSidecar) #Album
      : $(( tally=tally+1 ))
      c=$(echo "$data" | jq -r '.entry_data.PostPage[0].graphql.shortcode_media.edge_sidecar_to_children.edges | length')
      l=${#c}; i=0
      list=$(echo "$data" | jq -r '.entry_data.PostPage[0].graphql.shortcode_media.edge_sidecar_to_children.edges[].node.display_resources[2].src')
      for url in $list; do
        : $(( i=i+1 ))
        j=$(printf "%0${l}d" $i)
        ext=${url##*.}
        curl -sk --retry 1 "$url" -o "${name}-${j}.${ext}" &
      done ;;
    GraphVideo) #Video clip
      if [ "$gif" = 1 ]; then
        : $(( tally=tally+1 ))
        url=$(echo "$data" | jq -r '.entry_data.PostPage[0].graphql.shortcode_media.video_url')
        ext=${url##*.}
        curl -sk --retry 1 "$url" -o "${name}.${ext}" &
      fi ;;
  esac
}
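
#Note: the jq paths above assume the window._sharedData JSON that Instagram embedded
#in post pages at the time of writing; this structure can change without notice.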

getinsta () {
  #Downloads images from an instagram account, including videos if gifs are enabled, multiple images if albums are enabled
  url="$1"; term="sharedData = "
  #Post URLs are handed straight to getinstapost; return (not exit) so a reddit scrape can continue
  if echo "$url" | grep -q "/p/"; then getinstapost "$url"; return; fi
  ua="Mozilla/5.0 (Windows NT 6.1; rv:59.0) Gecko/20100101 Firefox/59.0"
  query="42323d64886122307be10013ad2dcc44" #Constant as far as I can tell, means we want to query media
  first=50; count=0; has_next="true"
  prefix=".entry_data.ProfilePage[0].graphql.user"
  while [ "$count" -le "$target" ] && [ "$has_next" = "true" ]; do
    if [ "$count" -ge 12 ]; then #Subsequent pages
      after=$(echo "$data" | jq -r "${prefix}.edge_owner_to_timeline_media.page_info.end_cursor")
      variables="\"id\":\"${uid}\",\"first\":${first},\"after\":\"${after}\""
      x_instagram_gis=$(printf "%s:{%s}" "${rhx_gis}" "${variables}" | md5sum | cut -d' ' -f1)
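      #Assumption: Instagram's X-Instagram-GIS header is the md5 of "rhx_gis:{variables}",
      #where rhx_gis is the token scraped from the initial page below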
      prefix=".data.user"
      url="https://www.instagram.com/graphql/query/?query_hash=${query}&variables=%7B${variables}%7D"
      data=$(curl -s -A "$ua" -b "csrftoken=$csrf" -H "X-Instagram-GIS: $x_instagram_gis" "$url")
    else #Initial page
      data=$(curl -s -A "$ua" -b "ig_cb=1" "https://${url}" | grep -Po "${term}{.*}" | sed "s/$term//g")
      rhx_gis=$(echo "$data" | jq -r '.rhx_gis')
      csrf=$(echo "$data" | jq -r '.config.csrf_token')
      uid=$(echo "$data" | jq -r "${prefix}.id")
    fi
    [ -z "$data" ] && return
    has_next=$(echo "$data" | jq -r "${prefix}.edge_owner_to_timeline_media.page_info.has_next_page")
    len=$(echo "$data" | jq -r "${prefix}.edge_owner_to_timeline_media.edges | length")
    i=0
    while [ "$i" -lt "$len" ]; do
      name=$(echo "$data" | jq -r "${prefix}.edge_owner_to_timeline_media.edges[$i].node.shortcode")
      type=$(echo "$data" | jq -r "${prefix}.edge_owner_to_timeline_media.edges[$i].node.__typename")
      case "$type" in
        GraphImage)
          : $(( tally=tally+1 ))
          url=$(echo "$data" | jq -r "${prefix}.edge_owner_to_timeline_media.edges[$i].node.display_url")
          ext=${url##*.}
          curl -sk --retry 1 "$url" -o "${name}.${ext}" & ;;
        #Pass a /p/ path so getinstapost can extract the shortcode
        GraphSidecar) if [ "$album" = 1 ]; then getinstapost "/p/$name"; fi ;;
        GraphVideo) if [ "$gif" = 1 ]; then getinstapost "/p/$name"; fi ;;
      esac
      i=$((i+1))
    done
    count=$((count+i))
  done
}
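
#Usage example (placeholder account name): ./snooscraper -i instagram.com/example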

getpinterest () {
  #Download images from a pinterest board or pin
  url="$1"; term="'jsInit1'"
  data=$(curl -s "https://${url}" | grep -Po "${term}>.*<" | sed "s/$term//g" | tr -d '<>')
  [ -z "$data" ] && return
  if echo "$url" | grep -q "/pin/"; then #pin
    list=$(echo "$data" | jq -r '.resourceDataCache[0].data.images."736x".url')
    #Join the two lists with a newline so adjacent URLs do not run together
    list=$(printf '%s\n%s' "$list" "$(echo "$data" | jq -r '.resourceDataCache[1].data.related_pins_feed[].images."736x".url')")
  else #board/profile
    list=$(echo "$data" | jq -r '.resourceDataCache[0].data.images."474x"[].url')
    list=$(printf '%s\n%s' "$list" "$(echo "$data" | jq -r '.resourceDataCache[1].data.board_feed[].images."736x".url')")
  fi
  for url in $list; do
    name=${url##*/}
    [ -f "$name" ] && continue
    curl -sk --retry 1 "$url" -o "$name" &
  done
}
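
#Note: this scrapes the inline 'jsInit1' JSON blob from Pinterest's markup as it was
#at the time of writing; the resourceDataCache layout is not a stable API.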

gettumblr () {
  #Find single image link from Open Graph tag and download from a tumblr post
  url="$1"; url=${url#*//}; url=${url%\?*}; term="\"og:image\" content="
  url=$(curl -sLk --retry 1 "http://${url}" | grep -Po "${term}\".*?\"" | sed "s/${term}//g" | tr -d '"')
  [ -z "$url" ] && return
  ext=${url##*.}
  if [ "$gif" = 0 ]; then case "$ext" in gif|gifv|webm|mp4) return ;; esac; fi
  [ -f "${name}.${ext}" ] && return
  : $(( tally=tally+1 ))
  curl -sk --retry 1 "$url" -o "${name}.${ext}" &
}
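
#Usage example (placeholder post URL): ./snooscraper -t https://example.tumblr.com/post/123456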

getalbum () {
  #Downloads all images from an imgur album, if they are new and they match gif preference
  url="$1"; url=${url#*//}; url=${url%\?*}; term="item: "
  data=$(curl -sk --retry 1 "https://${url}" | grep -Po "${term}{.*}" | sed "s/${term}//g")
  [ -z "$data" ] && return
  base="http://i.imgur.com/"
  case "${url%/*}" in
    */a|*/gallery)
      c=$(echo "$data" | jq -r '.album_images.images | length')
      list=$(echo "$data" | jq -r '.album_images.images | map(.hash,.ext) | join(",")' | sed -e 's/,\./\./g' -e 's/,/\n/g')
      i=0; l=${#c}
      for img in $list; do
        img=${img%\?*}
        ext=${img##*.}
        if [ "$gif" = 0 ]; then case "$ext" in gif|gifv|webm|mp4) continue ;; esac; fi
        : $(( i=i+1 ))
        j=$(printf "%0${l}d" $i)
        if [ "$c" = 1 ]; then new="${name}.${ext}"
        else new="${name}-${j}.${ext}"; fi
        [ -f "$new" ] && continue
        url="${base}${img}"
        curl -sk --retry 1 "$url" -o "${new}" &
      done
      if [ "$i" -gt 0 ]; then : $(( tally=tally+1 )); fi ;;
    *)
      ext=$(echo "$data" | jq -r '.ext')
      ext=${ext%\?*}
      if [ "$gif" = 0 ]; then case "$ext" in .gif|.gifv|.webm|.mp4) return ;; esac; fi
      hash=$(echo "$data" | jq -r '.hash')
      url="${base}${hash}${ext}"
      curl -sk --retry 1 "$url" -o "${name}${ext}" &
      : $(( tally=tally+1 ))
      ;;
  esac
}
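
#Usage example (placeholder album ID): ./snooscraper -a https://imgur.com/a/abc123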

getuser () {
  user=$1
  if [ -z "$user" ]; then echo "No username given. Aborting."; exit 1; fi
  echo "Scraping ${target} posts from /u/${user} matching your criteria."
  echo
  user="author:${user}"
  getreddit "$user"
}

getsub () {
  sub=$1
  if [ -z "$sub" ]; then echo "No subreddit given. Aborting."; exit 1; fi
  echo "Scraping ${target} posts from /r/${sub} matching your criteria."
  echo
  sub="subreddit:${sub}"
  getreddit "$sub"
}

getreddit () {
  criteria=$1; page=0; max=1000; per=$max; before=$(date +%s); given=${criteria#*:}
  printf "Initialising... "
  api="https://elastic.pushshift.io/rs/submissions/_search/?sort=created_utc:desc&_source=url,permalink,created_utc"
  if ! ping -c1 elastic.pushshift.io > /dev/null 2>&1; then echo "pushshift.io API is down, try again later. Aborting."; exit 1; fi
  mkdir -p "$given"; cd "$given" || exit
  trap 'echo "Interrupted, removing temporary files and finishing early."; rm -f temp found; exit 1' INT
  find . -type f | sed -e 's/.*\///g' -e 's/[\.-].*//g' -e ':r;s/^0//g;tr' | sort -u > found
  if [ "$limit" -gt 0 ] && [ "$limit" -le "$per" ]; then per=$limit; fi
  echo "done."
  #Loop for each subreddit page
  until [ "$tally" -ge "$target" ] && [ "$target" -gt 0 ] || [ "$per" -le 0 ] || [ "$before" = "null" ]; do
    printf "Getting JSON data... "
    url="${api}&size=${per}&q=(${criteria}%20AND%20created_utc:%3C${before})"
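    #e.g. for "-s pics" the first request is roughly (timestamp illustrative):
    #  ${api}&size=100&q=(subreddit:pics%20AND%20created_utc:%3C1514764800)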
    curl -sk --retry 1 "$url" -o temp
    list=$(jq -j '.hits.hits[]._source | .url + ",",(.permalink | split("/")[4] + "\n")' temp | grep -vf found | sed 's/amp;//g')
    [ "$gif" = 0 ] && list=$(echo "$list" | grep -v -e 'gfycat' -e '\.gif')
    before=$(jq -r ".hits.hits[$(( per-1 ))]._source.created_utc" temp)
    echo "done."
    printf "Searching page... "
    #Loop for each post
    for pair in $list; do
      if [ ! "$target" = 0 ] && [ "$tally" -ge "$target" ]; then break; fi
      name="${pair##*,}"
      [ "$name" = "$pair" ] && continue
      p=0; n=0; a=0; g=0; i=0; t=0; pi=0
      url=${pair%,*}; url=${url#*//}; url=${url#m.}
      ext=.${url##*.}; ext=${ext%\?*}
      case ${url%%/*} in
        *reddit.com) p=1; ext=".html" ;;
        i.reddituploads.com)
          #Take the extension from the Content-Type header (strip the trailing CR)
          ext=$(curl -skI "$url" | grep -i 'content-type:' | tr -d '\r')
          ext=".${ext##*/}" ;;
        gfycat.com|www.gfycat.com) ext=".webm"; url="giant.${url}${ext}" ;;
        instagram.com|www.instagram.com) i=1; url=${url%\?*} ;;
        pinterest.com|pinterest.co.uk|www.pinterest.com|www.pinterest.co.uk) pi=1; url=${url%\?*} ;;
        *.tumblr.com) [ ${#ext} -gt 5 ] && t=1 ;;
        imgur.com|www.imgur.com)
          if [ ${#ext} -le 5 ]; then url="i.${url}";
          else a=1; fi ;;
        *) if [ "${url##*_}" = "d${ext}" ]; then url="${url%_*}${ext%\?*}"; fi ;;
      esac
      case "$ext" in
        .gif|.gifv|.webm|.mp4) g=1 ;;
        .jpg|.jpeg|.png|.webp|.svg) ;;
        .htm|.html|.php) n=1 ;;
        *)
          if [ "$a" = 0 ] && [ "$i" = 0 ] && [ "$t" = 0 ]; then n=1; fi
          ext="" ;;
      esac
      case "$only" in
        post) [ "$p" = 0 ] && continue ;;
        nonimage) [ "$n" = 0 ] && continue ;;
        gif) [ "$g" = 0 ] && continue ;;
        album) [ "$a" = 0 ] && continue ;;
      esac
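      #printf treats the missing numeric argument to %0Nd as zero, so this prepends N zeros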
      [ ${#name} -lt 6 ] && name=$(printf "%0$((6-${#name}))d$name") #Zero pad reddit IDs
      name="${name}${ext}"
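      #Keep the post only if every detected attribute flag is allowed by the config:
      #a flag of 1 passes only when the corresponding setting is 1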
  309. if [ "$p" -le "$post" ] && \
  310. [ "$n" -le "$nonimage" ] && \
  311. [ "$a" -le "$album" ] && \
  312. [ "$g" -le "$gif" ] && \
  313. [ "$i" -le "$insta" ] && \
  314. [ "$pi" -le "$pinterest" ] && \
  315. [ "$t" -le "$tumblr" ]; then
  316. if [ "$a" = 1 ]; then getalbum "$url"
  317. elif [ "$i" = 1 ]; then getinsta "$url"
  318. elif [ "$pi" = 1 ]; then getpinterest "$url"
  319. elif [ "$t" = 1 ]; then gettumblr "$url"
  320. else curl -sk --retry 1 "http://${url}" -o "$name" &
  321. : $(( tally=tally+1 ))
  322. fi
  323. fi
  324. done
    echo "${tally} posts found so far"
    if [ "$limit" -gt 0 ]; then
      : $(( page=page+1 ))
      [ $(( max*(page+1) )) -ge $limit ] && per=$(( limit-max*page ))
    fi
  done
  rm -f temp found
  printf "Finishing downloads... "
  wait
  cd ..
  echo
  echo "Saved content from ${tally} posts to $(pwd)/${given}"
  if [ "$per" -le 0 ] && [ "$tally" -lt "$target" ]; then
    echo "Reached the limit of posts to search and could not find enough posts matching your criteria."
  fi
  [ "$before" = "null" ] && echo "Ran out of posts in the subreddit to search, could not find enough posts."
}

if [ -e "$config" ] && [ ! "$1" = "-h" ] && [ ! "$1" = "--help" ] && [ ! "$1" = "" ]; then
  echo "Using configuration file found in ${config}..."
fi

case $1 in
  -s|--sub) getsub "$2" ;;
  -u|--user) getuser "$2" ;;
  -c|--clean) clean "$2" ;;
  -a|--album) getalbum "$url" ;;
  -i|--instagram) getinsta "$url" ;;
  -p|--pinterest) getpinterest "$url" ;;
  -t|--tumblr) gettumblr "$url" ;;
  -h|--help|'')
    echo "
Usage: ${progname} [-s subreddit] [-u username] [-c directory] [-a URL] [-i URL] [-p URL] [-t URL] [-h]
Download posts (e.g. images) fitting certain criteria from reddit, and commonly linked sites

Arguments:
  -h, --help        Display this message and exit
  -s, --sub         Scrape posts from the given subreddit
  -u, --user        Scrape posts by the given user
  -c, --clean       Tidy reposts/failures in a given directory
  -a, --album       Download all images in an Imgur album at URL
  -i, --instagram   Download images/videos from an Instagram account/post at URL
  -p, --pinterest   Download images from a Pinterest board/pin at URL
  -t, --tumblr      Download all images in a Tumblr post at URL
" ;;
  *) echo "Invalid option. Aborting."; exit 1 ;;
esac

if [ ! "$1" = "-h" ] && [ ! "$1" = "--help" ] && [ ! "$1" = "-c" ] && [ ! "$1" = "--clean" ] && [ ! "$1" = "" ]; then
  echo "...done."
fi