get_bluemarble 5.6 KB

#!/bin/bash
base="https://visibleearth.nasa.gov"
url="$base/view_cat.php?categoryID=1484&order=asc&sequence=data&p="
# the Blue Marble collection, sorted by data date; the page number is appended in the loop
dl_path="bluemarble"
filetypes=""  # by default, download everything
notothers=0   # by default, download everything
notmonths=0   # by default, download everything
sep="========================="
################################################################################
# FUNCTIONS
usage() {
    # terminal styling for the help text below
    bold="$(tput bold)"
    red="$(tput setaf 1)"
    yellow="$(tput setaf 3)"
    reset="$(tput sgr0)"
    cat << EOF
This script will retrieve images from the

        NASA Visible Earth - Blue Marble
        ================================

collection found at this address:

    ${url%%&*}

It will attempt to download ALL the image (and some animation) files found on
the sub-pages into a structure of subfolders under this base directory:

    ${bold}${yellow}$dl_path${reset}

This can be changed via the ${bold}${red}-d${reset} option:

    ${bold}${yellow}$0 ${red}-d ${yellow}"path/to/files"${reset}    # relative or absolute

It is possible to restrict the file types downloaded by passing a list of
extensions separated by any non-alphanumeric character, e.g.:

    ${bold}${yellow}$0 ${red}-t ${yellow}jpg:tif:png${reset}    # case insensitive

It is possible to NOT download the images for the 12 months (option ${bold}${red}-M${reset})
or to NOT download any other files (option ${bold}${red}-O${reset}).

Downloading everything will take many hours even with a fast connection and
resulted in an 84 GB download as of Feb 2019. Restricting that to JPG images
of the 12 months only, e.g.:

    ${bold}${yellow}$0 ${red}-t ${yellow}jpg ${red}-M${reset}

resulted in a 14 GB download as of Feb 2019 and still took over an hour.

If you interrupt the script and re-run it later with the same base directory,
it will not attempt to re-download files that are already present.
EOF
}
strip_href() {
    retval="${1#*\"}"
    echo "${retval%\"*}"
}
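# illustrative example (hypothetical input): strip_href ' href="some_image.jpg"'
# prints:  some_image.jpg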
xpath() {
    xmllint --html --noent --nonet --xpath "$@" 2>/dev/null
    # xmllint creates loads of error messages even when it's working correctly;
    # if you really want to see these messages, remove '2>/dev/null'
}
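# typical call (as used in the main loop below), extracting link targets from an
# HTML snippet:  xpath "//a/@href" - <<<"$page"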
get_path() {
    case "$1" in
        *December*)
            [[ $notmonths == 1 ]] && return 1; echo "12/$(get_sub_path "$1")";;
        *January*)
            [[ $notmonths == 1 ]] && return 1; echo "01/$(get_sub_path "$1")";;
        *February*)
            [[ $notmonths == 1 ]] && return 1; echo "02/$(get_sub_path "$1")";;
        *March*)
            [[ $notmonths == 1 ]] && return 1; echo "03/$(get_sub_path "$1")";;
        *April*)
            [[ $notmonths == 1 ]] && return 1; echo "04/$(get_sub_path "$1")";;
        *May*)
            [[ $notmonths == 1 ]] && return 1; echo "05/$(get_sub_path "$1")";;
        *June*)
            [[ $notmonths == 1 ]] && return 1; echo "06/$(get_sub_path "$1")";;
        *July*)
            [[ $notmonths == 1 ]] && return 1; echo "07/$(get_sub_path "$1")";;
        *August*)
            [[ $notmonths == 1 ]] && return 1; echo "08/$(get_sub_path "$1")";;
        *September*)
            [[ $notmonths == 1 ]] && return 1; echo "09/$(get_sub_path "$1")";;
        *October*)
            [[ $notmonths == 1 ]] && return 1; echo "10/$(get_sub_path "$1")";;
        *November*)
            [[ $notmonths == 1 ]] && return 1; echo "11/$(get_sub_path "$1")";;
        *)
            [[ $notothers == 1 ]] && return 1; sanitize "$1";;
    esac
}
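# illustrative mapping: a title containing "January" and "Topography" yields
# "01/world.topo"; titles without a month name go through sanitize() instead
# (-M suppresses the month entries, -O suppresses the others)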
get_sub_path() {
    case "$1" in
        *"Topography and Bathymetry"*) echo "world.topo.bathy";;
        *Topography*)                  echo "world.topo";;
        *)                             echo "world";;
    esac
}
sanitize() {
    # replace every character that is not alphanumeric or '-' with '.',
    # then strip the leading "NASA Visible Earth" part of the title
    local sanitized_url="${1//[^a-zA-Z0-9-]/.}"
    echo "${sanitized_url#NASA.Visible.Earth..}"
}
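# illustrative example: sanitize "NASA Visible Earth: Blue Marble"  ->  Blue.Marble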
################################################################################
# MAIN
usage
printf "$sep\nUser choices:\n"
while getopts "t:d:OM" opt; do
    case $opt in
        t) filetypes="$OPTARG"
           echo "Filetypes chosen: $filetypes"
           ;;
        d) dl_path="$OPTARG"
           echo "Download path chosen: $PWD/$dl_path"
           ;;
        M) notmonths=1
           echo "Not downloading the images for the 12 Months"
           ;;
        O) notothers=1
           echo "Not downloading any Other images"
           ;;
        *) usage; exit 1
           ;;
    esac
done
printf "$sep\n"
read -p "Press Enter to begin or Ctrl-c to abort... "
# the index of the collection spans several pages, currently three
# loop: keep appending page numbers to $url...
for ((i=1; i<10; i++)); do
    printf "\n$sep$sep$sep\nPAGE $i: $url$i\n$sep$sep$sep\n"
    # from the grid of index thumbnails for the sub-pages, extract the links
    oldifs="$IFS"
    IFS=$'\n'
    grid=(
        $(wget -qO - "$url$i" | xpath "//div[@class[contains(.,'main')]]/div[@class[contains(.,'col')]]/div[@class=\"caption\"]/a" -)
    )
    IFS="$oldifs"
    # ...until we don't get results anymore
    [[ ${#grid[@]} -eq 0 ]] && break
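    # each element of grid is expected to hold one anchor of the (illustrative)
    # form <a href="...">Image title</a>, i.e. a sub-page link plus its title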
    # for each image page link, get the images
    for page in "${grid[@]}"; do
        title="$(xpath "//a/text()" - <<<"$page")"
        echo "Title: $title"
        img_page_link="$(xpath "//a/@href" - <<<"$page")"
        img_page_link="$(strip_href "$img_page_link")"
        path="$(get_path "$title")"
        # only continue if get_path returned a string (a subdirectory to create)
        if [[ "$path" != "" ]]; then
            printf "Download path (subdirectory): $dl_path/$path\n$sep\n"
            img_page="$(wget -qO - "$base/${img_page_link}")"
            img_link=( $(xpath "//div[@id=\"visuals\"]/div/div/a/@href" - <<<"$img_page") )
            for dl in "${img_link[@]}"; do
                dl="$(strip_href "$dl")"
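                # match the file extension case-insensitively against the chosen
                # list; an empty $filetypes means "download everything"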
                ext="${dl##*.}"
                if [[ "$filetypes" == "" ]] || [[ "${filetypes,,}" =~ (^|[^a-zA-Z0-9-])"${ext,,}"($|[^a-zA-Z0-9-]) ]]; then
                    if [ -r "$dl_path/$path/${dl##*/}" ]; then
                        echo "${dl##*/} exists; skipping"
                    else
                        printf "File to download: ${dl##*/}\nwget says:\n"
                        # finally, wget downloads the image into the calculated subfolder;
                        # wget creates these folders as needed
                        wget -nc -nv -P "$dl_path/$path" "$dl"
                        echo "$sep"
                    fi
                fi
            done
        else
            printf "Nothing to do here.\n$sep\n"
        fi
    done
done
exit