scrape.sh 1.4 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859
  1. #!/bin/sh
  2. # https://mro.name/radio-privatkopie
  3. #
  4. # Scrape broadcasts on air the next 90 minutes.
  5. #
  6. # Calls ../../../app/broadcast-store.sh to write file, match podcasts etc.
  7. #
  8. # Called from ../../../app/cron/hourly.sh
  9. #
  # Work relative to this script's directory: the converter binary, etc.sh and
  # the ../../../app path used below are all resolved from here.
  cd "$(dirname "${0}")" || exit 1

  # Platform-specific converter; it is fed each downloaded page on stdin.
  readonly cmd="./html2broadcast-br-$(uname -s)-$(uname -m)"

  # Scrape window [tmin, tmax], formatted by date as YYYY-MM-DDTHH:MM:SS.
  # NOTE(review): the upper bound is now + 120 minutes although the file header
  # speaks of 90 minutes -- confirm which one is intended.
  readonly tmin="$(date +'%FT%T')"
  # Probe whether date accepts GNU-style '-d' relative expressions.
  if [ "12:30" = "$(date -d '11:00 today + 90 minutes' +'%H:%M' 2>/dev/null)" ] ; then
    # looks like a Linux
    readonly tmax="$(date -d 'now + 120 minutes' +'%FT%T')"
  else
    # hope it's a BSD
    readonly tmax="$(date -v+120M +'%FT%T')"
  fi

  # Expected to provide PROGRAM_URL; stop early instead of running curl with an
  # empty URL and silently scraping nothing.
  . ./etc.sh
  : "${PROGRAM_URL:?PROGRAM_URL must be set (expected from etc.sh)}"

  # One scratch file for all downloads, removed on every exit path.
  tmp_html="$(mktemp /tmp/radio-scrape.XXXXXX)" || exit 1
  cleanup() {
    rm -f -- "${tmp_html}"
  }
  trap cleanup EXIT
  # A handler that merely deletes the file would let the loop carry on after
  # the signal; exit instead, which in turn runs the EXIT trap.
  trap 'exit 1' INT HUP QUIT TERM ALRM USR1

  # Paths of the broadcast detail pages whose start date lies in the window.
  # The bounds are passed with --arg rather than spliced into the jq program.
  paths="$(curl \
      --connect-timeout 5 \
      --location \
      --silent \
      --url "${PROGRAM_URL}" \
      --user-agent "https://mro.name/radio-privatkopie" \
    | jq --arg tmin "${tmin}" --arg tmax "${tmax}" \
      '.channelBroadcasts[] | select($tmin <= .broadcastStartDate and .broadcastStartDate <= $tmax) | .broadcastHtml' \
    | grep -hoE '/[^" ]+/programmkalender/ausstrahlung-[0-9]+\.html')"

  # The grep pattern admits no blanks, so word splitting yields one path per
  # item. Globbing is switched off so a '*', '?' or '[' inside a path is never
  # expanded against the local file system.
  set -f
  for ur in ${paths} ; do
    url="https://br.de${ur}"
    # echo "GET ${url}"

    # Start from an empty file so a transfer that writes nothing cannot leave
    # the previous page behind.
    : > "${tmp_html}"

    # --fail: treat HTTP error responses as failures instead of storing the
    # server's error page as if it were a broadcast.
    curl \
      --connect-timeout 10 \
      --fail \
      --location \
      --output "${tmp_html}" \
      --silent \
      --url "${url}" \
      --user-agent "https://mro.name/radio-privatkopie" \
    || {
      echo "failed: GET ${url}" >&2
      continue
    }

    # Convert the page and hand the result, together with its source URL, to
    # the store script.
    "${cmd}" < "${tmp_html}" \
    | sh "../../../app/broadcast-store.sh" "${url}"
  done
  set +f

  # Nothing is started in the background here; wait is kept so the script's
  # final exit status stays what it was.
  wait