123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125 |
#!/bin/bash
#
# Download a web page that contains explanations for SmartSymbol weather
# symbols, and parse it. The settings below are defaults; the command-line
# options may override file, URL, useragent and debug.

# Debug switch: 0 = quiet, 1 = show diagnostics (set by -d).
debug=0

# Script name without its directory part, used in the usage text.
me="${0##*/}"

# User agent sent with the download (override with -U).
# "getua" is a little script to get the user agent of the currently installed
# Firefox; that lookup is disabled and a fixed string is used instead:
#~ useragent="$(getua)" ||
useragent="Mozilla/5.0 (X11; Linux x86_64; rv:91.0) Gecko/20100101 Firefox/91.0"

# Local file to parse instead of downloading (set by -f); empty = download.
file=""

# The divs we're searching for (value of their class attribute):
query=tablecomponent

# Page to download; english is the default, -l selects another language.
URL="https://en.ilmatieteenlaitos.fi/weather-symbols"

# External commands the script needs.
deps=( wget xmllint grep )
#######################################
# Print an optional message followed by the help text, then exit.
# Globals:   deps, me, useragent (read)
# Arguments: optional message words; printed first when non-empty
# Outputs:   message and help text on stdout
# Returns:   never returns; exits the script with status 1
#######################################
usage() {
  # printf instead of echo, so a message such as "-n" is printed literally
  # rather than being taken as an echo option.
  if [[ -n "$*" ]]; then
    printf '%s\n' "$*"
  fi
  cat <<EOF
Download a web page that contains explanations for SmartSymbol weather symbols,
and parse it.
Dependencies: ${deps[*]}
Usage: $me [options] [> file]
Options:
-l str Language: one of en, sv or fi (default: en)
-U str Override user agent. Default:
$useragent
-d Debugging output
-f str Parse local file. Invalidates other options. Mostly for debugging.
EOF
  exit 1
}
#######################################
# Print a separator to stderr, for structuring the debugging output.
# Arguments: optional title words; when non-empty they are printed as a
#            "############ title" line above the rule
# Outputs:   one or two lines on stderr, nothing on stdout
#######################################
__sep() {
  if [[ -n "$*" ]]; then
    printf '############ %s\n' "$*" >&2
  fi
  printf '%s\n' "##################################################################" >&2
}
#######################################
# Evaluate an XPath expression against an HTML snippet with xmllint.
# Arguments: $1 - XPath expression
#            $2 - HTML text to search
# Outputs:   whatever xmllint prints for the expression, on stdout
# Returns:   exit status of xmllint
#######################################
xpath() {
  # for xmllint to recognize broken html snippets as utf8-encoded:
  local utf='<meta charset="utf-8" />'
  # xmllint creates loads of error messages even when it's working correctly
  # if you really want to see these messages, remove '2>/dev/null'
  printf '%s\n' "$utf$2" | xmllint --html --nonet --xpath "$1" - 2>/dev/null
}
#######################################
# Parse the command-line options into the script's global settings.
# Globals:   file, URL, useragent, debug (written)
# Arguments: the script's command-line arguments ("$@")
# Returns:   0 when all options were accepted; an invalid option or option
#            argument ends the script through usage
#######################################
parse_args() {
  local opt OPTIND=1
  while getopts "f:l:U:hd" opt; do
    case "$opt" in
      f)
        if [[ -r "$OPTARG" ]]; then
          file="$OPTARG"
        else
          usage "cannot read $OPTARG"
        fi
        ;;
      l)
        case "$OPTARG" in
          # Set the english URL explicitly so that the last -l always wins,
          # also after an earlier "-l sv" or "-l fi".
          en) URL="https://en.ilmatieteenlaitos.fi/weather-symbols" ;;
          sv) URL="https://sv.ilmatieteenlaitos.fi/vadersymbolerna" ;;
          fi) URL="https://www.ilmatieteenlaitos.fi/saamerkkien-selitykset" ;;
          *) usage "Wrong language $OPTARG" ;;
        esac
        ;;
      U) useragent="$OPTARG" ;;
      d) debug=1 ;;
      # -h, and anything getopts itself rejected: show the help text.
      h | *) usage ;;
    esac
  done
}
parse_args "$@"
# Main flow: fetch the page (or read the local file), cut out the
# "$query" divs, and print one "icon:description" line per image of the first
# table, followed by the length of the longest description.

# Report a fatal error on the script's original stderr (fd 3, saved below
# before stderr is silenced) and exit with status 1.
die() {
  printf '%s: %s\n' "$me" "$*" >&3
  exit 1
}

# All progress output goes to stderr, which is thrown away unless debugging
# was requested. Keep the real stderr on fd 3 for fatal errors.
exec 3>&2
if [[ "$debug" == 0 ]]; then
  exec 2>/dev/null
fi
printf 'User Agent: %s\n' "$useragent" >&2

# simple dependency check
for dep in "${deps[@]}"; do
  command -v "$dep" >/dev/null || usage "missing dependency: $dep"
done

__sep "wgetting $URL or file"
# Explicit if/else: a failed download must not fall through to reading an
# empty file name.
if [[ -z "$file" ]]; then
  html="$(wget -U "$useragent" -O - "$URL")" || die "download of $URL failed"
else
  html="$(<"$file")" || die "cannot read $file"
fi

__sep "getting good stuff with xmllint"
html="$(xpath "//div[@id=\"main-content\"]" "$html")"

# One table[] entry (1-based) per occurrence of the word $query in the page:
# entry i holds the i-th div whose class attribute is exactly $query.
table=()
i=0
while IFS= read -r line; do
  [[ "$line" == "$query" ]] || continue
  i=$((i + 1))
  table[i]="$(xpath "(//div[@class=\"$query\"])[$i]" "$html")"
  __sep "table $i"
  printf '%s\n' "${table[i]}" >&2
done < <(grep -o -w -- "$query" <<<"$html")

__sep "Normal output (end result)"
echo "Let's just assume that the first table is the only one we're interested in." >&2
# To process every table instead, wrap the rest in:
#~ for ((i=1;i<=${#table[@]};i++)); do ... done
i=1
longest=0

# Collect the alt and src attributes of the table's images, one array element
# per non-empty output line. Read loops are used instead of IFS-based word
# splitting so that IFS stays untouched and the text is never glob-expanded.
text=()
while IFS= read -r line || [[ -n "$line" ]]; do
  if [[ -n "$line" ]]; then
    text+=("$line")
  fi
done < <(xpath "//table/tbody/tr/td/img/@alt" "${table[i]}")

icon=()
while IFS= read -r line || [[ -n "$line" ]]; do
  if [[ -n "$line" ]]; then
    icon+=("$line")
  fi
done < <(xpath "//table/tbody/tr/td/img/@src" "${table[i]}")

for ((j = 0; j < ${#text[@]}; j++)); do
  # Each element looks like: name="value". Keep what is between the first
  # and the last double quote.
  s1="${icon[j]%\"*}"
  s1="${s1#*\"}"
  # Reduce the image path to its file name without the extension.
  s1="${s1##*/}"
  s1="${s1%.*}"
  s2="${text[j]%\"*}"
  s2="${s2#*\"}"
  if ((${#s2} > longest)); then
    longest=${#s2}
  fi
  printf "%s:%s\n" "$s1" "$s2"
done
printf 'longest:%s\n' "$longest"
|