#!/bin/bash
base="https://visibleearth.nasa.gov"
url="$base/view_cat.php?categoryID=1484&order=asc&sequence=data&p="
# the Blue Marble collection, sorted by data date; the page number is appended in the loop
dl_path="bluemarble"
filetypes=""  # by default, download everything
notothers=0   # by default, download everything
notmonths=0   # by default, download everything
sep="========================="

################################################################################
# FUNCTIONS

usage() {
  bold=$'\e[1m'
  red=$'\e[38;5;1m'
  yellow=$'\e[38;5;3m'
  reset=$'\e[m\e(B'
  cat << EOF
This script will retrieve images from the
NASA Visible Earth - Blue Marble
================================
collection found at this address:
${url%%&*}
It will attempt to download ALL the image (and some animation) files found in
the subpages into a structure of subfolders under this base directory:
${bold}${yellow}$dl_path${reset}
This can be changed via the ${bold}${red}-d${reset} option:
${bold}${yellow}$0 ${red}-d ${yellow}"path/to/files"${reset} # relative or absolute
The filetypes downloaded can be restricted by passing a list of extensions
separated by any non-alphanumeric character, e.g.:
${bold}${yellow}$0 ${red}-t ${yellow}jpg:tif:png${reset} # case insensitive
It is also possible to skip the images for the 12 months (option ${bold}${red}-M${reset}) or
to skip all other files (option ${bold}${red}-O${reset}).
Downloading everything takes many hours even with a fast connection and
resulted in an 84G download as of Feb 2019.
Restricting that to JPG images of the 12 months only, e.g.:
${bold}${yellow}$0 ${red}-t ${yellow}jpg ${red}-M${reset}
resulted in a 14G download as of Feb 2019 and still took over an hour.
If you interrupt the script and re-run it later with the same base directory,
it will not re-download files that are already present.
EOF
}
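
# strip_href: given an attribute string such as href="image/73909/world.jpg"
# (hypothetical value), print only the part between the first and the last
# double quote, i.e. image/73909/world.jpg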
strip_href() {
  retval="${1#*\"}"
  echo "${retval%\"*}"
}
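
# xpath: thin wrapper around xmllint's --xpath mode for HTML read from stdin;
# callers pass the XPath expression plus "-" for stdin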
xpath() {
  xmllint --html --noent --nonet --xpath "$@" 2>/dev/null
  # xmllint prints many error messages even when it is working correctly;
  # remove '2>/dev/null' if you really want to see them
}
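
# get_path: map a page title to the download subdirectory.
# Month names become a numeric prefix (01-12) plus a suffix from get_sub_path;
# every other title is sanitized into a directory name.
# Returns 1 (and prints nothing) when that class of title was excluded via -M/-O.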
get_path() {
  case "$1" in
    *December*)
      [[ $notmonths == 1 ]] && return 1; echo "12/$(get_sub_path "$1")";;
    *January*)
      [[ $notmonths == 1 ]] && return 1; echo "01/$(get_sub_path "$1")";;
    *February*)
      [[ $notmonths == 1 ]] && return 1; echo "02/$(get_sub_path "$1")";;
    *March*)
      [[ $notmonths == 1 ]] && return 1; echo "03/$(get_sub_path "$1")";;
    *April*)
      [[ $notmonths == 1 ]] && return 1; echo "04/$(get_sub_path "$1")";;
    *May*)
      [[ $notmonths == 1 ]] && return 1; echo "05/$(get_sub_path "$1")";;
    *June*)
      [[ $notmonths == 1 ]] && return 1; echo "06/$(get_sub_path "$1")";;
    *July*)
      [[ $notmonths == 1 ]] && return 1; echo "07/$(get_sub_path "$1")";;
    *August*)
      [[ $notmonths == 1 ]] && return 1; echo "08/$(get_sub_path "$1")";;
    *September*)
      [[ $notmonths == 1 ]] && return 1; echo "09/$(get_sub_path "$1")";;
    *October*)
      [[ $notmonths == 1 ]] && return 1; echo "10/$(get_sub_path "$1")";;
    *November*)
      [[ $notmonths == 1 ]] && return 1; echo "11/$(get_sub_path "$1")";;
    *)
      [[ $notothers == 1 ]] && return 1; sanitize "$1";;
  esac
}
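
# get_sub_path: pick the subfolder for a monthly image from its title, e.g. a
# title containing "Topography and Bathymetry" goes into .../world.topo.bathy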
get_sub_path() {
  case "$1" in
    *"Topography and Bathymetry"*) echo "world.topo.bathy" ;;
    *Topography*)                  echo "world.topo" ;;
    *)                             echo "world" ;;
  esac
}
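
# sanitize: turn a page title into a safe directory name by replacing every
# character other than letters, digits and '-' with a dot, then dropping the
# leading "NASA.Visible.Earth.." prefix, e.g. (hypothetical title)
#   "NASA Visible Earth: Twin Blue Marbles" -> Twin.Blue.Marbles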
sanitize() {
  local sanitized_url="${1//[^a-zA-Z0-9-]/.}"
  echo "${sanitized_url#NASA.Visible.Earth..}"
}

################################################################################
# MAIN

usage
printf "$sep\nUser choices:\n"
while getopts "t:d:OM" opt; do
  case $opt in
    t) filetypes="$OPTARG"
       echo "Filetypes chosen: $filetypes"
       ;;
    d) dl_path="$OPTARG"
       echo "Download path chosen: $PWD/$dl_path"
       ;;
    M) notmonths=1
       echo "Not downloading the images for the 12 Months"
       ;;
    O) notothers=1
       echo "Not downloading any Other images"
       ;;
    *) usage; exit 1
       ;;
  esac
done
printf "$sep\n"
read -p "Press Enter to begin or Ctrl-c to abort... "

# the index of the collection spans several pages (currently three)
# loop: keep appending page numbers to $url...
for ((i=1;i<10;i++)); do
  printf "\n$sep$sep$sep\nPAGE $i: $url$i\n$sep$sep$sep\n"
  # from the grid of index thumbnails for the subpages, extract the links
  oldifs="$IFS"
  IFS=$'\n'
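  # splitting on newlines only, so the <a> elements returned by xpath are not
  # broken apart at the spaces inside their attributes and link text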
  grid=(
    $(wget -qO - "$url$i" | xpath "//div[@class[contains(.,'main')]]/div[@class[contains(.,'col')]]/div[@class=\"caption\"]/a" -)
  )
  IFS="$oldifs"
  # ...until we don't get results anymore
  [[ ${#grid[@]} -eq 0 ]] && break
  # for each image page link, get the images
  for page in "${grid[@]}"; do
    title="$(xpath "//a/text()" - <<<"$page")"
    echo "Title: $title"
    img_page_link="$(xpath "//a/@href" - <<<"$page")"
    img_page_link="$(strip_href "$img_page_link")"
    path="$(get_path "$title")"
    # only continue if get_path returned a string (a subdirectory to create)
    if [[ "$path" != "" ]]; then
      printf "Download path (subdirectory): $dl_path/$path\n$sep\n"
      img_page="$(wget -qO - "$base/${img_page_link}")"
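      # each entry in the "visuals" block of the image page is one direct
      # download link; collect the href attributes into an array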
      img_link=( $(xpath "//div[@id=\"visuals\"]/div/div/a/@href" - <<<"$img_page") )
      for dl in "${img_link[@]}"; do
        dl="$(strip_href "$dl")"
- if [[ "$filetypes" == "" ]] || [[ "${filetypes,,}" =~ (^|[^a-zA-Z0-9-])"${dl##*.}"($|[^a-zA-Z0-9-]) ]]; then
- if [ -r "$dl_path/$path/${dl##*/}" ]; then
- echo "${dl##*/} exists; skipping"
- else
- printf "File to download: ${dl##*/}\nwget says:\n"
- # finally, wget downloads the image into the subfolder calculated.
- # wget creates these folders as needed.
- wget -nc -nv -P "$dl_path/$path" "$dl"
- echo "$sep"
- fi
- fi
- done
- else printf "Nothing to do here.\n$sep\n"
- fi
- done
- done
- exit