get_symbol_text 3.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125
  #!/bin/bash
  # Defaults for the script. The option parsing further down may change
  # debug (-d), useragent (-U), file (-f) and URL (-l).
  debug=0       # 0: diagnostics on stderr are discarded; 1: they are shown
  me="${0##*/}" # name this script was called by, without the directory part
  # a little script to get useragent of currently installed FF
  #~ useragent="$(getua)" ||
  useragent="Mozilla/5.0 (X11; Linux x86_64; rv:91.0) Gecko/20100101 Firefox/91.0"
  file=""       # local HTML file to parse instead of downloading
  # The divs we're searching for:
  query=tablecomponent
  URL="https://en.ilmatieteenlaitos.fi/weather-symbols" # english is default
  # External programs the script needs
  deps=( wget xmllint grep )
  #######################################
  # Print an optional message followed by the help text, then end the script.
  # Globals:   deps, me, useragent (read)
  # Arguments: $* - optional message shown before the help text
  # Outputs:   message and help text on stdout
  # Exits:     always, with status 1
  #######################################
  usage() {
    # printf instead of echo, so a message like "-n" is printed, not eaten
    [[ -n "$*" ]] && printf '%s\n' "$*"
    # The heredoc text stays at column 0 so the help is printed unindented
    cat <<EOF
Download a web page that contains explanations for SmartSymbol weather symbols,
and parse it.
Dependencies: ${deps[@]}
Usage: $me [options] [> file]
Options:
-l str Language: one of en, sv or fi (default: en)
-U str Override user agent. Default:
$useragent
-d Debugging output
-f str Parse local file. Invalidates other options. Mostly for debugging.
EOF
    exit 1
  }
  #######################################
  # Write a separator banner to stderr.
  # Arguments: $* - optional caption; when given it is printed on its own
  #                 line before the separator line
  # Outputs:   one or two lines on stderr, nothing on stdout
  #######################################
  __sep() {
    [[ -n "$*" ]] && printf '%s\n' "############ $*" >&2
    printf '%s\n' "##################################################################" >&2
  }
  #######################################
  # Run an XPath expression against an HTML snippet using xmllint.
  # Arguments: $1 - XPath expression
  #            $2 - HTML text to search
  # Outputs:   xmllint's result on stdout; xmllint's stderr is discarded
  # Returns:   exit status of xmllint
  #######################################
  xpath() {
    # for xmllint to recognize broken html snippets as utf8-encoded:
    # (local, so the helper variable does not leak into the rest of the script)
    local utf='<meta charset="utf-8" />'
    # xmllint creates loads of error messages even when it's working correctly
    # if you really want to see these messages, remove '2>/dev/null'
    printf '%s\n' "$utf$2" | xmllint --html --nonet --xpath "$1" - 2>/dev/null
  }
  #######################################
  # Parse the command-line options.
  # Globals:   file, URL, useragent, debug (written)
  # Arguments: the script's command-line arguments
  # Bad input (unreadable -f file, unknown -l language, unknown option) and
  # -h are handed to usage.
  #######################################
  parse_options() {
    local opt OPTARG OPTIND=1
    while getopts "f:l:U:hd" opt; do
      case "$opt" in
        f)
          # -r alone is also true for directories, which cannot be read as a
          # page, so they are rejected here as well
          if [[ -r "$OPTARG" && ! -d "$OPTARG" ]]; then
            file="$OPTARG"
          else
            usage "cannot read $OPTARG"
          fi
          ;;
        l)
          case "$OPTARG" in
            en) ;; # nothing to change, URL is left as it is
            sv) URL="https://sv.ilmatieteenlaitos.fi/vadersymbolerna" ;;
            fi) URL="https://www.ilmatieteenlaitos.fi/saamerkkien-selitykset" ;;
            *) usage "Wrong language $OPTARG" ;;
          esac
          ;;
        U) useragent="$OPTARG" ;;
        d) debug=1 ;;
        *) usage ;; # -h has no branch of its own and ends up here too
      esac
    done
  }
  parse_options "$@"
  # Without debugging, everything written to stderr from here on is discarded
  if [[ "$debug" == 0 ]]; then
    exec 2>/dev/null
  fi
  echo "User Agent: $useragent" >&2

  #######################################
  # simple dependency check
  # Globals: deps (read)
  # Calls usage, naming the program, for the first entry of deps that cannot
  # be found.
  #######################################
  check_deps() {
    local dep
    for dep in "${deps[@]}"; do
      # Name the missing program: the message from 'type' goes to stderr,
      # which is discarded unless debugging is on
      type -f "$dep" >/dev/null || usage "Missing dependency: $dep"
    done
  }
  check_deps
  __sep "wgetting $URL or file"
  # Take the page from the local file when one was given, otherwise download
  # it. Written as if/else rather than "A && B || C", so that a failed
  # download is not followed by an attempt to read an empty file name.
  if [[ -n "$file" ]]; then
    html="$(<"$file")"
  else
    html="$(wget -U "$useragent" -O - "$URL")" || {
      # only visible with debugging on; the exit status reports the failure
      echo "$me: could not download $URL" >&2
      exit 1
    }
  fi

  __sep "getting good stuff with xmllint"
  html="$(xpath "//div[@id=\"main-content\"]" "$html")"

  # grep prints one line per occurrence of $query as a whole word; for the
  # n-th occurrence the n-th div of that class is stored in table[n], so the
  # array starts at index 1.
  table=()
  i=0
  while read -r line; do
    if [[ "$line" == "$query" ]]; then
      ((i++))
      table[i]="$(xpath "(//div[@class=\"$query\"])[$i]" "$html")"
      __sep "table $i"
      echo "${table[i]}" >&2
    fi
  done < <(grep -o -w -- "$query" <<<"$html")
  #######################################
  # Print one "name:text" line per symbol image in a table, then a final
  # "longest:N" line where N is the length of the longest text.
  # name is the image file name from the src attribute without directory and
  # extension; text is the value of the alt attribute.
  # Arguments: $1 - HTML of one table
  # Outputs:   result lines on stdout
  #######################################
  print_symbols() {
    local -a text=() icon=()
    local line s1 s2 j longest=0
    # Read line by line instead of IFS=$'\n' arr=( $(...) ): IFS stays
    # untouched for the rest of the script and the values are not subject to
    # pathname expansion. Empty lines are skipped.
    while IFS= read -r line || [[ -n "$line" ]]; do
      [[ -n "$line" ]] && text+=("$line")
    done < <(xpath "//table/tbody/tr/td/img/@alt" "$1")
    while IFS= read -r line || [[ -n "$line" ]]; do
      [[ -n "$line" ]] && icon+=("$line")
    done < <(xpath "//table/tbody/tr/td/img/@src" "$1")

    # Entries are presumably of the form  name="value"  (one attribute per
    # line from xmllint) - the stripping below relies on the double quotes.
    for ((j = 0; j < ${#text[@]}; j++)); do
      s1="${icon[j]%\"*}" # cut from the last double quote
      s1="${s1#*\"}"      # cut up to the first double quote
      s1="${s1##*/}"      # drop the directory part
      s1="${s1%.*}"       # drop the extension
      s2="${text[j]%\"*}"
      s2="${s2#*\"}"
      (( ${#s2} > longest )) && longest=${#s2}
      printf "%s:%s\n" "$s1" "$s2"
    done
    echo "longest:$longest"
  }

  __sep "Normal output (end result)"
  echo "Let's just assume that the first table is the only one we're interested in." >&2
  i=1
  # Disabled: loop over all tables instead of only the first one
  #~ for ((i=1;i<=${#table[@]};i++)); do
  #~ echo "##################################"
  #~ echo "${table[i]}"
  print_symbols "${table[i]}"
  #~ done