- # NAME
- Scrappy - The All Powerful Web Spidering, Scraping, Creeping Crawling Framework
- # VERSION
- version 0.94112090
- # SYNOPSIS
- #!/usr/bin/perl
- use Scrappy;
- my $scraper = Scrappy->new;
-
- $scraper->crawl('http://search.cpan.org/recent',
- '/recent' => {
- '#cpansearch li a' => sub {
- print $_[1]->{href}, "\n";
- }
- }
- );
- And now manually, ... without crawl, the above is similar to the following ...
- #!/usr/bin/perl
- use Scrappy;
- my $scraper = Scrappy->new;
-
- if ($scraper->get($url)->page_loaded) {
- $scraper->select('#cpansearch li a')->each(sub{
- print shift->{href}, "\n";
- });
- }
- # DESCRIPTION
- Scrappy is an easy (and hopefully fun) way of scraping, spidering, and/or
- harvesting information from web pages, web services, and more. Scrappy is a
- feature rich, flexible, intelligent web automation tool.
- Scrappy (pronounced Scrap+Pee) == 'Scraper Happy' or 'Happy Scraper'; If you
- like you may call it Scrapy (pronounced Scrape+Pee) although Python has a web
- scraping framework by that name and this module is not a port of that one.
- ## FEATURES
- Scrappy provides a framework containing all the tools necessary to create a
- simple yet powerful web scraper. At its core, Scrappy loads an array of
- features for access control, event logging, session handling, url matching,
- web request and response handling, proxy management, web scraping, and downloading.
- Furthermore, Scrappy provides a simple Moose-based plugin system that allows Scrappy
- to be easily extended.
- my $scraper = Scrappy->new;
-
- $scraper->control; # Scrappy::Scraper::Control (access control)
- $scraper->parser; # Scrappy::Scraper::Parser (web scraper)
- $scraper->user_agent; # Scrappy::Scraper::UserAgent (user-agent tools)
- $scraper->logger; # Scrappy::Logger (event logger)
- $scraper->queue; # Scrappy::Queue (flow control for loops)
- $scraper->session; # Scrappy::Session (session management)
- Please see the METHODS section for a more in-depth look at all Scrappy
- functionality.
- ## ATTRIBUTES
- The following is a list of object attributes available with every Scrappy instance,
- attributes always return an instance of the class they represent.
- ### content
- The content attribute holds the [HTTP::Response](http://search.cpan.org/perldoc?HTTP::Response) object of the current request.
- Returns undef if no page has been successfully fetched.
- my $scraper = Scrappy->new;
- $scraper->content;
- ### control
- The control attribute holds the [Scrappy::Scraper::Control](http://search.cpan.org/perldoc?Scrappy::Scraper::Control) object which is used
- to provide access control to the scraper.
- my $scraper = Scrappy->new;
- $scraper->control;
-
- ... $scraper->control->restrict('google.com');
- ... $scraper->control->allow('cpan.org');
- ... if $scraper->control->is_allowed($url);
- ### debug
- The debug attribute holds a boolean which controls whether event logs are captured.
- my $scraper = Scrappy->new;
- $scraper->debug(1);
- ### logger
- The logger attribute holds the [Scrappy::Logger](http://search.cpan.org/perldoc?Scrappy::Logger) object which is used to provide
- event logging capabilities to the scraper.
- my $scraper = Scrappy->new;
- $scraper->logger;
- ### parser
- The parser attribute holds the [Scrappy::Scraper::Parser](http://search.cpan.org/perldoc?Scrappy::Scraper::Parser) object which is used
- to scrape html data from the specified source material.
- my $scraper = Scrappy->new;
- $scraper->parser;
- ### plugins
- The plugins attribute holds the [Scrappy::Plugin](http://search.cpan.org/perldoc?Scrappy::Plugin) object which is an interface
- used to load plugins.
- my $scraper = Scrappy->new;
- $scraper->plugins;
- ### queue
- The queue attribute holds the [Scrappy::Queue](http://search.cpan.org/perldoc?Scrappy::Queue) object which is used to provide
- flow-control for the standard loop approach to crawling.
- my $scraper = Scrappy->new;
- $scraper->queue;
- ### session
- The session attribute holds the [Scrappy::Session](http://search.cpan.org/perldoc?Scrappy::Session) object which is used to provide
- session support and persistent data across executions.
- my $scraper = Scrappy->new;
- $scraper->session;
- ### user_agent
- The user_agent attribute holds the [Scrappy::Scraper::UserAgent](http://search.cpan.org/perldoc?Scrappy::Scraper::UserAgent) object which is
- used to set and manipulate the user-agent header of the scraper.
- my $scraper = Scrappy->new;
- $scraper->user_agent;
- ### worker
- The worker attribute holds the [WWW::Mechanize](http://search.cpan.org/perldoc?WWW::Mechanize) object which is used navigate web
- pages and provide request and response header information.
- my $scraper = Scrappy->new;
- $scraper->worker;
- # METHODS
- ## back
- The back method is the equivalent of hitting the "back" button in a browser, it
- returns the previous page (response) and returns that URL, it will not backtrack
- beyond the first request.
- my $scraper = Scrappy->new;
-
- $scraper->get(...);
- ...
- $scraper->get(...);
- ...
- my $last_url = $scraper->back;
- ## cookies
- The cookies method returns an HTTP::Cookies object. Note! Cookies can be made
- persistent by enabling session-support. Session-support is enabled by simply
- specifying a file to be used.
- my $scraper = Scrappy->new;
-
- $scraper->session->write('session.yml'); # enable session support
- $scraper->get(...);
- my $cookies = $scraper->cookies;
- ## crawl
- The crawl method is very useful when it is desired to crawl an entire website or
- at-least partially, it automates the tasks of creating a queue, fetching and
- parsing html pages, and establishing simple flow-control. See the SYNOPSIS for
- a simplified example, ... the following is a more complex example.
- my $scrappy = Scrappy->new;
-
- $scrappy->crawl('http://search.cpan.org/recent',
- '/recent' => {
- '#cpansearch li a' => sub {
- my ($self, $item) = @_;
- # follow all recent modules from search.cpan.org
- $self->queue->add($item->{href});
- }
- },
- '/~:author/:name-:version/' => {
- 'body' => sub {
- my ($self, $item, $args) = @_;
-
- my $reviews = $self
- ->select('.box table tr')->focus(3)->select('td.cell small a')
- ->data->[0]->{text};
-
- $reviews = $reviews =~ /\d+ Reviews/ ?
- $reviews : '0 reviews';
-
- print "found $args->{name} version $args->{version} ".
- "[$reviews] by $args->{author}\n";
- }
- }
- );
- ## domain
- The domain method returns the domain host of the current page. Local pages, e.g.
- file:///this/that/the_other will return undef.
- my $scraper = Scrappy->new;
-
- $scraper->get('http://www.google.com');
- print $scraper->domain; # print www.google.com
- ## download
- The download method is passed a URL, a Download Directory Path and optionally
- a File Path, then it will follow the link and store the response contents into
- the specified file without leaving the current page. Basically it downloads the
- contents of the request (especially when the request pushes a file download). If
- a File Path is not specified, Scrappy will attempt to name the file automatically
- resorting to a random 6-character string only if all else fails, then returns to
- the originating page.
- my $scraper = Scrappy->new;
- my $requested_url = '...';
-
- $scraper->download($requested_url, '/tmp');
-
- # supply your own file name
- $scraper->download($requested_url, '/tmp', 'somefile.txt');
- ## dumper
- The dumper method is a convenience feature that passes the passed-in objects to
- [Data::Dumper](http://search.cpan.org/perldoc?Data::Dumper) which in turn returns a stringified representation of that
- object/data-structure.
- my $scraper = Scrappy->new;
- my $requested_url = '...';
-
- $scraper->get($requested_url);
-
- my $data = $scraper->select('//a[@href]')->data;
-
- # print out the scraped data
- print $scraper->dumper($data);
- ## form
- The form method is used to submit a form on the current page.
- my $scraper = Scrappy->new;
-
- $scraper->form(fields => {
- username => 'mrmagoo',
- password => 'foobarbaz'
- });
-
- # or more specifically, for pages with multiple forms
-
- $scraper->form(form_name => 'login_form', fields => {
- username => 'mrmagoo',
- password => 'foobarbaz'
- });
-
- $scraper->form(form_number => 1, fields => {
- username => 'mrmagoo',
- password => 'foobarbaz'
- });
- ## get
- The get method takes a URL or URI object, fetches a web page and returns the
- Scrappy object.
- my $scraper = Scrappy->new;
-
- if ($scraper->get($new_url)->page_loaded) {
- ...
- }
-
- # $self->content has the HTTP::Response object
- ## log
- The log method logs an event with the event logger.
- my $scraper = Scrappy->new;
-
- $scraper->debug(1); # unnecessary, on by default
- $scraper->logger->verbose(1); # more detailed log
-
- $scraper->log('error', 'Something bad happened');
-
- ...
-
- $scraper->log('info', 'Something happened');
- $scraper->log('warn', 'Something strange happened');
- $scraper->log('coolness', 'Something cool happened');
- Note! Event logs are always recorded but never automatically written to a file
- unless explicitly told to do so using the following:
- $scraper->logger->write('log.yml');
- ## page_content_type
- The page_content_type method returns the content_type of the current page.
- my $scraper = Scrappy->new;
- $scraper->get('http://www.google.com/');
- print $scraper->page_content_type; # prints text/html
- ## page_data
- The page_data method returns the HTML content of the current page, additionally
- this method when passed a string with HTML markup, updates the content of the
- current page with that data and returns the modified content.
- my $scraper = Scrappy->new;
- $scraper->get(...);
- my $html = $scraper->page_data;
- ## page_ishtml
- The page_ishtml method returns true/false based on whether our content is HTML,
- according to the HTTP headers.
- my $scraper = Scrappy->new;
-
- $scraper->get($requested_url);
- if ($scraper->page_ishtml) {
- ...
- }
- ## page_loaded
- The page_loaded method returns true/false based on whether the last request was
- successful.
- my $scraper = Scrappy->new;
-
- $scraper->get($requested_url);
- if ($scraper->page_loaded) {
- ...
- }
- ## page_match
- The page_match method checks the passed-in URL (or URL of the current page if
- left empty) against the URL pattern (route) defined. If URL is a match, it will
- return the parameters of that match much in the same way a modern web application
- framework processes URL routes.
- my $url = 'http://somesite.com/tags/awesomeness';
-
- ...
-
- my $scraper = Scrappy->new;
-
- # match against the current page
- my $this = $scraper->page_match('/tags/:tag');
- if ($this) {
- print $this->{'tag'};
- # ... prints awesomeness
- }
-
- .. or ..
-
- # match against a passed url
- my $this = $scraper->page_match('/tags/:tag', $url, {
- host => 'somesite.com'
- });
-
- if ($this) {
- print "This is the ", $this->{tag}, " page";
- # ... prints this is the awesomeness page
- }
- ## page_reload
- The page_reload method acts like the refresh button in a browser, it simply
- repeats the current request.
- my $scraper = Scrappy->new;
-
- $scraper->get(...);
- ...
- $scraper->page_reload;
- ## page_status
- The page_status method returns the 3-digit HTTP status code of the response.
- my $scraper = Scrappy->new;
- $scraper->get(...);
-
- if ($scraper->page_status == 200) {
- ...
- }
- ## page_text
- The page_text method returns a text representation of the last page having
- all HTML markup stripped.
- my $scraper = Scrappy->new;
- $scraper->get(...);
-
- my $text = $scraper->page_text;
- ## page_title
- The page_title method returns the content of the title tag if the current page
- is HTML, otherwise returns undef.
- my $scraper = Scrappy->new;
- $scraper->get('http://www.google.com/');
-
- my $title = $scraper->page_title;
- print $title; # print Google
- ## pause
- This method sets breaks between your requests in an attempt to simulate human
- interaction.
- my $scraper = Scrappy->new;
- $scraper->pause(20);
-
- $scraper->get($request_1);
- $scraper->get($request_2);
- $scraper->get($request_3);
- Given the above example, there will be a 20 second break between each request made,
- get, post, request, etc., You can also specify a range to have the pause method
- select from at random...
- $scraper->pause(5,20);
-
- $scraper->get($request_1);
- $scraper->get($request_2);
-
- # reset/turn it off
- $scraper->pause(0);
-
- print "I slept for ", ($scraper->pause), " seconds";
- Note! The download method is exempt from any automatic pausing.
- ## plugin
- The plugin method allow you to load a plugin. Using the appropriate case is
- recommended but not necessary. See [Scrappy::Plugin](http://search.cpan.org/perldoc?Scrappy::Plugin) for more information.
- my $scraper = Scrappy->new;
-
- $scraper->plugin('foo_bar'); # will load Scrappy::Plugin::FooBar
- $scraper->plugin('foo-bar'); # will load Scrappy::Plugin::Foo::Bar
- $scraper->plugin('Foo::Bar'); # will load Scrappy::Plugin::Foo::Bar
-
- # more practically
- $scraper->plugin('whois', 'spammer_check');
-
- ... somewhere in code
-
- my $var = $scraper->plugin_method();
- # example using core plugin Scrappy::Plugin::RandomProxy
-
- my $s = Scrappy->new;
-
- $s->plugin('random_proxy');
- $s->use_random_proxy;
-
- $s->get(...);
- ## post
- The post method takes a URL, a hashref of key/value pairs, and optionally an
- array of key/value pairs, and posts that data to the specified URL, then returns
- an HTTP::Response object.
- my $scraper = Scrappy->new;
- $scraper->post($requested_url, {
- input_a => 'value_a',
- input_b => 'value_b'
- });
-
- # w/additional headers
- my %headers = ('Content-Type' => 'multipart/form-data');
- $scraper->post($requested_url, {
- input_a => 'value_a',
- input_b => 'value_b'
- }, %headers);
- Note! The most common post headers for content-type are
- application/x-www-form-urlencoded and multipart/form-data.
- ## proxy
- The proxy method will set the proxy for the next request to be tunneled through.
- my $scraper = Scrappy->new;
-
- $scraper->proxy('http', 'http://proxy1.example.com:8000/');
- $scraper->get($requested_url);
-
- $scraper->proxy('http', 'ftp', 'http://proxy2.example.com:8000/');
- $scraper->get($requested_url);
-
- # best practice when using proxies
-
- use Try::Tiny;
-
- my $proxie = Scrappy->new;
-
- $proxie->proxy('http', 'http://proxy.example.com:8000/');
-
- try {
- $proxie->get($requested_url);
- } catch {
- die "Proxy failed\n";
- };
- Note! When using a proxy to perform requests, be aware that if they fail your
- program will die unless you wrap your code in an eval statement or use a try/catch
- mechanism. In the example above we use Try::Tiny to trap any errors that might occur
- when using a proxy.
- ## request_denied
- The request_denied method is a simple shortcut to determine if the page you
- requested got loaded or redirected. This method is very useful on systems
- that require authentication and redirect if not authorized. This function
- returns a boolean, 1 if the current page doesn't match the requested page.
- my $scraper = Scrappy->new;
- $scraper->get($url_to_dashboard);
-
- if ($scraper->request_denied) {
- # do login, again
- }
- else {
- # resume ...
- }
- ## response
- The response method returns the HTTP::Response object of the current page.
- my $scraper = Scrappy->new;
- $scraper->get(...);
- my $res = $scraper->response;
- ## select
- The select method takes XPATH or CSS selectors and returns a
- [Scrappy::Scraper::Parser](http://search.cpan.org/perldoc?Scrappy::Scraper::Parser) object which contains the matching elements.
- my $scraper = Scrappy->new;
-
- # return a list of links
- my $list = $scraper->select('#profile li a')->data; # see Scrappy::Scraper::Parser
-
- foreach my $link (@{$list}) {
- print $link->{href}, "\n";
- }
-
- # Zoom in on specific chunks of html code using the following ...
- my $list = $scraper
- ->select('#container table tr') # select all rows
- ->focus(4) # focus on the 5th row
- ->select('div div')->data;
-
- # The code above selects the div > div inside of the 5th tr in #container table
- # Access attributes html, text and other attributes as follows...
-
- $element = $scraper->select('table')->data->[0];
- $element->{html}; # HTML representation of the table
- $element->{text}; # Table stripped of all HTML
- $element->{cellpadding}; # cellpadding
- $element->{height}; # ...
- ## stash
- The stash method sets a stash (shared) variable or returns a reference to the entire
- stash object.
- my $scraper = Scrappy->new;
- $scraper->stash(age => 31);
-
- print 'stash access works'
- if $scraper->stash('age') == $scraper->stash->{age};
-
- my @array = (1..20);
- $scraper->stash(integers => [@array]);
- ## store
- The store method stores the contents of the current page into the specified file.
- If the content-type does not begin with 'text', the content is saved as binary data.
- my $scraper = Scrappy->new;
-
- $scraper->get($requested_url);
- $scraper->store('/tmp/foo.html');
- ## url
- The url method returns the complete URL for the current page.
- my $scraper = Scrappy->new;
- $scraper->get('http://www.google.com/');
- print $scraper->url; # prints http://www.google.com/
- # AUTHOR
- Al Newkirk <awncorp@cpan.org>
- # COPYRIGHT AND LICENSE
- This software is copyright (c) 2010 by awncorp.
- This is free software; you can redistribute it and/or modify it under
- the same terms as the Perl 5 programming language system itself.
|