Project.pm.bak 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194
  1. package Scrappy::Project;
  2. BEGIN {
  3. $Scrappy::Project::VERSION = '0.94112090';
  4. }
  5. use Carp;
  6. use File::Find::Rule;
  7. use Scrappy;
  8. use Moose::Role;
  # app: built eagerly at construction time. The default first installs a
  # fresh Scrappy instance as the scraper, then returns the result of the
  # consuming class's setup() method if it defines one, otherwise the
  # project object itself.
  has app => (
      is      => 'ro',
      isa     => 'Any',
      default => sub {
          my ($project) = @_;

          # every project starts out with its own scraper
          $project->scraper(Scrappy->new);

          # let the consuming class customise itself when it provides setup()
          return $project->setup if $project->meta->has_method('setup');
          return $project;
      }
  );
  # parsers: array reference of fully-qualified module names found below the
  # consuming class's namespace (e.g. My::Project::* for class My::Project),
  # discovered by scanning every directory in @INC for *.pm files.
  #
  # Each module name appears once, even when the same module is installed
  # under several @INC directories, and names are returned in discovery
  # order (@INC order).
  has parsers => (
      is      => 'ro',
      isa     => 'Any',
      default => sub {
          my $self      = shift;
          my $namespace = ref $self;

          (my $class_path = $namespace) =~ s{::}{/}g;

          my %seen;
          my @parsers;

          # @INC may contain hooks (code refs / objects); only plain
          # directory entries can be searched on disk
          for my $inc (grep { !ref } @INC) {
              my $dir = "$inc/$class_path";
              next unless -d $dir;

              # relative() yields paths below $dir, so the module name is
              # derived from the part under the class directory only and can
              # never pick up fragments of the @INC prefix
              my @files =
                File::Find::Rule->file()->name('*.pm')->relative->in($dir);

              for my $file (@files) {
                  (my $name = $file) =~ s{\.pm\z}{};
                  $name =~ s{/}{::}g;

                  my $plug = $namespace . "::" . $name;
                  push @parsers, $plug unless $seen{$plug}++;
              }
          }

          return \@parsers;
      }
  );
  # registry: hash reference mapping every discovered parser name, and its
  # lower-cased form, to the parser's real module name.
  #
  # The default reads the parsers attribute, so it is lazy: it is computed on
  # first access rather than during construction, which keeps it independent
  # of the order in which attributes are initialised.
  has registry => (
      is      => 'ro',
      isa     => 'HashRef',
      lazy    => 1,
      default => sub {
          my $self = shift;

          my %registry;
          for my $parser (@{ $self->parsers }) {
              $registry{$parser}    = $parser;
              $registry{lc $parser} = $parser;
          }

          return \%registry;
      }
  );
  # records: route => array reference of records, filled by parse_document.
  # routes:  route => options hash (holding at least the parser name),
  #          filled by route.
  # Both are read-write hash references that start out empty.
  has [qw(records routes)] => (
      is      => 'rw',
      isa     => 'HashRef',
      default => sub { return {} },
  );
  # scraper: the Scrappy instance used to fetch and match pages (read-write).
  has scraper => (is => 'rw', isa => 'Scrappy');
  # Register a route and the parser responsible for it.
  #
  # Call forms:
  #   $self->route($route, $parser);
  #   $self->route($route, parser => $parser, %other_options);
  #
  # The parser may be given fully qualified (starting with the project's own
  # namespace, "<class>::") or as a shortcut that is expanded below it:
  #   the first letter is upper-cased          list      -> <class>::List
  #   "-" separates namespace levels           foo-bar   -> <class>::Foo::Bar
  #   "_" separates words joined in CamelCase  foo_bar   -> <class>::FooBar
  #
  # The options (without the route key itself) are stored in
  # $self->routes under the route. Returns $self so calls can be chained.
  # Croaks when no route or no parser was supplied.
  sub route {
      my ($self, @args) = @_;
      my $options = {};

      if (@args == 2) {
          # basic definition: route and parser given positionally
          @{$options}{qw(route parser)} = @args;
      }
      elsif (@args % 2) {
          # odd definition: route followed by key/value options
          my ($route, %spec) = @args;
          $options = {%spec, route => $route};
      }

      # check route and parser spec
      croak("Error defining route, must have a route and parser assignment")
        unless $options->{route} && $options->{parser};

      # convert parser from shortcut if used; only names that do not already
      # start with the project namespace are expanded
      my $namespace = ref($self) . "::";
      if (index($options->{parser}, $namespace) != 0) {

          # make fully-qualified parser name
          my $parser = ucfirst $options->{parser};
          $parser = join("::", map { ucfirst $_ } split(/-/, $parser))
            if $parser =~ /-/;
          $parser = join("", map { ucfirst $_ } split(/_/, $parser))
            if $parser =~ /_/;

          $options->{parser} = $namespace . $parser;
      }

      # the route is the key, so it is not repeated inside the stored options
      my $route = delete $options->{route};
      $self->routes->{$route} = $options;

      return $self;
  }
  # Parse the currently loaded page with the parser of the first route that
  # matches $url.
  #
  # Routes are tried in sorted (string) order, so when several routes match
  # the same URL the same parser is chosen on every run instead of depending
  # on hash ordering.
  #
  # For the matching route the parser module is loaded, instantiated, given
  # the project's scraper, and its parse() method is called with the value
  # returned by the scraper's page_match(). The resulting record is appended
  # to $self->records->{$route} and returned.
  #
  # Returns 0 when no route matches. Croaks when $url is false or when no
  # routes have been defined.
  sub parse_document {
      my ($self, $url) = @_;
      my $scraper = $self->scraper;

      croak("Unable to fetch document, URL is not defined") unless $url;

      my $routes = $self->routes;
      croak("Can't parse document, No routes defined")
        unless keys %{$routes};

      # try to match against route(s)
      for my $route (sort keys %{$routes}) {
          my $this = $scraper->page_match($route, $url);
          next unless $this;

          # load the parser class on demand: Foo::Bar -> Foo/Bar.pm
          my $parser = $routes->{$route}->{parser};
          (my $module = "$parser.pm") =~ s{::}{/}g;
          require $module;

          my $new = $parser->new;
          $new->scraper($scraper);

          # make sure the route has a record list even if parsing fails
          $self->records->{$route} = []
            unless defined $self->records->{$route};

          my $record = $new->parse($this);
          push @{ $self->records->{$route} }, $record;

          return $record;
      }

      return 0;
  }
  # Crawl starting at $starting_url and return the collected records.
  #
  # May be called on an instance or on a class name; in the latter case a
  # new object is created with new(). The starting URL is added to the
  # scraper's queue, then every queued URL is fetched and handed to
  # parse_document() when the page was loaded, is HTML and has status 200.
  # Croaks when no starting URL is given.
  sub crawl {
      my ($invocant, $starting_url) = @_;

      my $self = $invocant;
      $self = $invocant->new unless ref $invocant;

      croak("Error, can't execute without a starting url")
        unless $starting_url;

      my $queue = $self->scraper->queue;
      $queue->add($starting_url);

      while (my $url = $queue->next) {
          $self->scraper->get($url);

          # only successfully loaded HTML pages are parsed
          next unless $self->scraper->page_loaded;
          next unless $self->scraper->page_ishtml;
          next unless $self->scraper->page_status == 200;

          $self->parse_document($url);
      }

      return $self->records;
  }
  149. 1;