123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194 |
package Scrappy::Project;

# The distribution version is assigned at compile time so that it is already
# set when later `use` statements in this file are processed.
BEGIN {
    our $VERSION = '0.94112090';
}
- use Carp;
- use File::Find::Rule;
- use Scrappy;
- use Moose::Role;
# app: read-only attribute whose default doubles as the object's start-up
# hook. The default installs a fresh Scrappy instance in the `scraper`
# attribute, then yields the result of the consuming class's `setup` method
# when its metaclass reports one, or the object itself otherwise.
has app => (
    is      => 'ro',
    isa     => 'Any',
    default => sub {
        my ($project) = @_;

        $project->scraper(Scrappy->new);

        return $project unless $project->meta->has_method('setup');
        return $project->setup;
    },
);
# parsers: array reference of parser class names found on disk.
#
# Every directory entry in @INC is searched for "<Class/Path>/**/*.pm",
# where <Class/Path> is the consuming class name with "::" turned into "/".
# Each file found is converted back into a package name
# (e.g. My/App/Foo/Bar.pm -> My::App::Foo::Bar).
#
# The returned list is sorted and contains each package name once, even when
# the same module is reachable through several @INC entries.
has parsers => (
    is      => 'ro',
    isa     => 'Any',
    default => sub {
        my $self = shift;

        (my $class_path = ref $self) =~ s{::}{/}g;

        my %seen;
        my @parsers;

        # @INC may contain hooks (references) as well as plain directories
        for my $inc (grep { !ref } @INC) {
            my $dir = "$inc/$class_path";
            next unless -d $dir;

            # ask for paths relative to $dir so that nothing in the @INC
            # entry itself can be mistaken for part of the package name
            my @files =
              File::Find::Rule->file()->name('*.pm')->relative->in($dir);

            for my $file (@files) {
                (my $plug = "$class_path/$file") =~ s{\.pm\z}{};
                $plug =~ s{/}{::}g;
                push @parsers, $plug unless $seen{$plug}++;
            }
        }

        return [sort @parsers];
    }
);
# registry: hash reference used to look up parser class names.
#
# Every entry of `parsers` is registered twice: under its exact name and
# under its lower-cased name; both keys map to the exact name.
#
# The attribute is lazy because its default reads another attribute
# (`parsers`), and Moose does not promise any particular order when it
# generates non-lazy defaults during object construction.
has registry => (
    is      => 'ro',
    isa     => 'HashRef',
    lazy    => 1,
    default => sub {
        my $self = shift;

        my %registry;
        for my $parser (@{$self->parsers}) {
            $registry{$parser}    = $parser;
            $registry{lc $parser} = $parser;
        }

        return \%registry;
    }
);
# records: read-write hash reference, empty by default. parse_document()
# stores the parsed records in it, keyed by route.
has records => (
    isa     => 'HashRef',
    is      => 'rw',
    default => sub { return {} },
);
# routes: read-write hash reference, empty by default. route() fills it with
# one options hash per route pattern.
has routes => (
    isa     => 'HashRef',
    is      => 'rw',
    default => sub { return {} },
);
# scraper: read-write Scrappy instance used for fetching and matching pages.
has scraper => (
    isa => 'Scrappy',
    is  => 'rw',
);
# Register a route, i.e. associate a URL pattern with a parser class.
#
# Call forms:
#   $self->route($pattern => $parser);
#   $self->route($pattern, parser => $parser, %other_options);
#
# $parser is either a class name that already starts with the invocant's
# own namespace (ref($self) . '::'), which is stored unchanged, or a
# shortcut that is expanded into that namespace:
#   'list'           -> <Class>::List
#   'search-results' -> <Class>::Search::Results   ('-' separates packages)
#   'user_profile'   -> <Class>::UserProfile       ('_' separates words)
#
# The options (without the route itself) are stored in $self->routes under
# the route pattern. Croaks when no route or no parser was supplied.
# Returns $self.
sub route {
    my ($self, @args) = @_;
    my $options = {};

    if (@args == 2) {

        # basic definition: route => parser
        @{$options}{qw(route parser)} = @args;
    }
    elsif (@args % 2) {

        # extended definition: route followed by key/value options
        my ($route, %extra) = @args;
        $options = {%extra, route => $route};
    }

    # check route and parser spec
    croak("Error defining route, must have a route and parser assignment")
      unless $options->{route} && $options->{parser};

    # Convert the parser from a shortcut if one was used. The namespace is
    # quoted and anchored; note that `$x !~ ref($self) . "::"` would bind as
    # `($x !~ ref($self)) . "::"`, which is always true.
    my $namespace = ref($self) . '::';
    if ($options->{parser} !~ /\A\Q$namespace\E/) {

        # make fully-qualified parser name
        my $parser = ucfirst $options->{parser};
        $parser = join '::', map { ucfirst $_ } split /-/, $parser
          if $parser =~ /-/;
        $parser = join '', map { ucfirst $_ } split /_/, $parser
          if $parser =~ /_/;

        $options->{parser} = $namespace . $parser;
    }

    # the route pattern is the key, so it is not kept inside the options
    my $route = delete $options->{route};
    $self->routes->{$route} = $options;

    return $self;
}
# Run the parser registered for the first route that matches $url.
#
# Each registered route pattern is handed to the scraper's page_match()
# together with $url. On the first truthy match the route's parser class is
# loaded and instantiated, given the scraper, and its parse() method is
# called with the match result. The record returned by parse() is appended
# to $self->records->{<route>} and returned.
#
# Croaks when $url is false or when no routes are defined.
# Returns 0 when no route matches.
sub parse_document {
    my ($self, $url) = @_;
    my $scraper = $self->scraper;

    croak("Unable to fetch document, URL is not defined") unless $url;
    croak("Can't parse document, No routes defined")
      unless keys %{$self->routes};

    # try to match against route(s)
    for my $pattern (keys %{$self->routes}) {
        my $match = $scraper->page_match($pattern, $url);
        next unless $match;

        my $parser_class = $self->routes->{$pattern}->{parser};

        no warnings 'redefine';
        no strict 'refs';

        # load the parser class: Foo::Bar -> Foo/Bar.pm
        my $file = $parser_class;
        $file =~ s/::/\//g;
        $file .= ".pm";
        require $file;

        my $parser = $parser_class->new;
        $parser->scraper($scraper);

        # make sure the route has a record list before parsing starts
        if (!defined $self->records->{$pattern}) {
            $self->records->{$pattern} = [];
        }

        my $record = $parser->parse($match);
        push @{$self->records->{$pattern}}, $record;

        return $record;
    }

    return 0;
}
# Crawl from $starting_url, parsing every successfully fetched HTML page.
#
# May be called on an instance or on a class name; in the latter case a new
# object is created with ->new. The starting URL is added to the scraper's
# queue and the queue is then drained: every URL is fetched, and
# parse_document() is called for it when the page loaded, is HTML and has
# status 200.
#
# Croaks when $starting_url is false. Returns $self->records.
sub crawl {
    my ($class, $starting_url) = @_;

    # accept either an object or a class name
    my $self = $class;
    $self = $class->new unless ref $class;

    croak("Error, can't execute without a starting url") unless $starting_url;

    my $queue = $self->scraper->queue;
    $queue->add($starting_url);

    while (my $url = $queue->next) {
        $self->scraper->get($url);

        # only successfully loaded HTML documents are parsed
        next unless $self->scraper->page_loaded;
        next unless $self->scraper->page_ishtml;
        next unless $self->scraper->page_status == 200;

        $self->parse_document($url);
    }

    return $self->records;
}
- 1;
|