#!/bin/bash
#
############################
#                          #
# DuckDuckGo Search Script #
#                          #
############################
#
# Searches DuckDuckGo for the given text and writes the resulting links, one
# per line, to DDGresults.txt in the current directory and to standard output.
#
# Usage: <script> SEARCH TERMS...
#   All arguments are joined with single spaces to form the query, so the
#   terms may be passed quoted as one argument or unquoted as several.
#
# Requires: wget, od, tr, sed, sort, mktemp and a grep that supports -P (PCRE).
#
# Initially created as a component of Category's "Chaffer", inspired by the
# book "Little Brother" by Cory Doctorow - currently a WIP.
#
set -euo pipefail

# File the final list of links is written to (relative to the current dir).
readonly RESULTS_FILE='DDGresults.txt'
# HTML (non-JavaScript) search endpoint.
readonly SEARCH_URL='https://duckduckgo.com/html/'

# Path of the temporary download; global so the EXIT trap can always see it.
page_file=''

# Print a message to stderr and abort with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Print usage to stderr and abort with status 2.
usage() {
  printf 'Usage: %s SEARCH TERMS...\n' "${0##*/}" >&2
  exit 2
}

# Remove the temporary download, if one was created.
cleanup() {
  [[ -z "$page_file" ]] || rm -f -- "$page_file"
}

#######################################
# grep wrapper for use under `set -e -o pipefail`: finding no matching lines
# (grep status 1) is a normal outcome here, real errors (status >= 2) are not.
# Arguments: passed straight to grep
# Returns:   0 on match or no match, grep's status otherwise
#######################################
filter() {
  local rc=0
  grep "$@" || rc=$?
  (( rc <= 1 )) || return "$rc"
}

#######################################
# Encode a string for use as a URL query value.
# Works on the raw bytes (via od) so multi-byte characters are encoded
# correctly regardless of locale. Spaces become "+", as the original script
# did; unreserved characters pass through; everything else becomes %XX.
# Arguments: $1 - text to encode
# Outputs:   the encoded text on stdout
#######################################
urlencode() {
  local byte ch encoded=''
  while read -r byte; do
    [[ -n "$byte" ]] || continue # od pads its output with blanks
    case "$byte" in
      20) encoded+='+' ;;
      # - . _ ~ 0-9 A-Z a-z
      2D | 2E | 5F | 7E | 3[0-9] | 4[1-9A-F] | 5[0-9A] | 6[1-9A-F] | 7[0-9A])
        printf -v ch '%b' "\\x${byte}"
        encoded+=$ch
        ;;
      *) encoded+="%${byte}" ;;
    esac
  done < <(printf '%s' "$1" | od -An -v -tx1 | tr ' a-f' '\nA-F')
  printf '%s\n' "$encoded"
}

#######################################
# Download the HTML results page for a query.
# Arguments: $1 - raw (unencoded) query text
#            $2 - file to write the page to
#######################################
fetch_results_page() {
  local query=$1 dest=$2 encoded
  encoded=$(urlencode "$query")
  wget -q -O "$dest" "${SEARCH_URL}?q=${encoded}" \
    || die "error: could not fetch results from ${SEARCH_URL}"
}

#######################################
# Turn a results page into a sorted, de-duplicated list of external links.
# Inputs:  HTML on stdin
# Outputs: one URL per line on stdout
#######################################
extract_links() {
  # 1. keep only lines that can contain a link
  # 2. undo the percent-encoding of ":", "/" and "-" ('#' is used as the sed
  #    delimiter where the replacement itself contains a slash) and drop
  #    DuckDuckGo's redirect prefix
  #    NOTE(review): the prefix below is the one the script was written
  #    against; confirm it still matches current DuckDuckGo markup.
  # 3. print what sits between href=" and the closing double quote
  # 4. keep absolute http(s) links, de-duplicate, drop DuckDuckGo's own links
  #    (-F so the dot in "duck.co" is literal rather than a regex wildcard)
  filter href \
    | sed -e 's/%3A/:/g' \
      -e 's#%2F#/#g' \
      -e 's/%2D/-/g' \
      -e 's#/l/?kh=-1&uddg=##g' \
    | filter -Po '(?<=href=")[^"]*' \
    | filter http \
    | sort -u \
    | filter -vi duckduckgo \
    | filter -vF duck.co
}

main() {
  (( $# > 0 )) || usage

  # Join every argument so an unquoted multi-word query is not truncated.
  local query="$*"
  [[ -n "${query// /}" ]] || usage

  page_file=$(mktemp) || die 'error: mktemp failed'
  trap cleanup EXIT

  fetch_results_page "$query" "$page_file"
  extract_links < "$page_file" > "$RESULTS_FILE"

  # Output list of links
  cat -- "$RESULTS_FILE"
}

main "$@"