strip.pl 1.9 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556
  1. #!/usr/bin/perl
  2. # Copyright (C) 2003, 2004, 2008 Alex Schroeder <alex@gnu.org>
  3. #
  4. # This program is free software; you can redistribute it and/or modify
  5. # it under the terms of the GNU General Public License as published by
  6. # the Free Software Foundation; either version 3 of the License, or
  7. # (at your option) any later version.
  8. #
  9. # This program is distributed in the hope that it will be useful,
  10. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. # GNU General Public License for more details.
  13. #
  14. # You should have received a copy of the GNU General Public License
  15. # along with this program. If not, see <http://www.gnu.org/licenses/>.
  16. use CGI qw/:standard/;
  17. use CGI::Carp qw(fatalsToBrowser);
  18. use LWP::UserAgent;
  19. use Encode;
  20. use HTML::Parser;
  # Main CGI flow.
  #
  # Without a 'url' parameter the script prints an HTML form asking for one.
  # With a 'url' parameter it fetches that document, collects the text found
  # inside <a> elements (see start_handler) and prints the distinct texts,
  # sorted, one per line, as text/plain.
  use strict;
  use warnings;

  if (not param('url')) {
      print header(),
          start_html('Link Stripping'),
          h1('Link Stripping'),
          p('Transforms HTML into a plain-text list of link texts, one link per line.',
            'For example, the HTML &lt;a href="foo.html">bar&lt;/a> will be transformed to',
            'the text "bar".'),
          start_form(-method=>'GET'),
          p('HTML feed: ', textfield('url', '', 40), checkbox('latin-1'), submit()),
          end_form(),
          end_html();
      exit;
  }

  # Force scalar context: in list context param() returns every value of a
  # repeated parameter, which would otherwise be spliced into the argument
  # list of HTTP::Request->new below.
  my $url = scalar param('url');

  my $ua = LWP::UserAgent->new;
  # The URL comes straight from the request, so refuse schemes such as
  # file: that would make the server read its own resources.
  $ua->protocols_allowed(['http', 'https']);

  my $response = $ua->request(HTTP::Request->new('GET', $url));
  if (not $response->is_success) {
      # Report the failure instead of parsing an error page as if it were
      # the requested document.
      print header(-type=>'text/plain; charset=UTF-8', -status=>'502 Bad Gateway'),
          "Cannot fetch $url: ", $response->status_line, "\n";
      exit;
  }

  print header(-type=>'text/plain; charset=UTF-8');

  # Work on raw octets; when the 'latin-1' box is ticked, transcode them to
  # UTF-8 first so the output matches the charset announced in the header.
  my $data = $response->content;
  $data = encode('utf-8', decode('latin-1', $data)) if param('latin-1');

  # Package variable (not a lexical) because start_handler, defined below,
  # records every link text as a key of this hash.
  our %pages = ();

  my $p = HTML::Parser->new(api_version => 3);
  # The parser is fed undecoded UTF-8 octets; utf8_mode makes decoded
  # entities come out as UTF-8 octets as well, rather than Latin-1 bytes.
  $p->utf8_mode(1);
  $p->handler( start => \&start_handler, "tagname,self");
  $p->parse($data);
  $p->eof; # signal end of document

  print join("\n", sort keys %pages), "\n";
  # start_handler($tagname, $parser)
  #
  # HTML::Parser "start" callback, registered with argspec "tagname,self".
  # Ignores every element except <a>.  For an <a> element it installs
  #   - a text handler that accumulates the decoded text of the link, and
  #   - an end handler that, on the matching </a>, stores the collected text
  #     as a key of %pages and switches the text handler off again.
  #
  # Text is accumulated until </a> itself, so inline markup inside the link
  # (e.g. <a><b>x</b> y</a>) neither truncates nor splits the link text.
  # Runs of whitespace are collapsed to a single space and the ends trimmed
  # so that each stored text occupies exactly one output line; links without
  # any text are not recorded.
  sub start_handler {
      my ($tagname, $self) = @_;
      return if $tagname ne "a";

      my $link_text = '';
      $self->handler(text => sub { $link_text .= shift }, "dtext");

      # The parser is taken from the callback arguments ("self") instead of
      # closing over $self, so the parser does not end up holding a closure
      # that refers back to the parser.
      $self->handler(end => sub {
          my ($end_tag, $parser) = @_;
          return if $end_tag ne "a";    # inner end tags such as </b> do not close the link

          $parser->handler(text => "");

          $link_text =~ s/[ \t\r\n\f]+/ /g;
          $link_text =~ s/^ //;
          $link_text =~ s/ $//;
          $pages{$link_text} = 1 if length $link_text;

          # This handler stays installed; emptying the buffer makes a stray
          # later </a> a no-op.
          $link_text = '';
      }, "tagname,self");
  }