phantomjs snippets

How to scrape web pages with PhantomJS and jQuery

Tagged phantomjs, scrape, jquery  Languages javascript

This is an example of how to scrape the web using PhantomJS and jQuery:

// Shared script state: the page being automated, the address of the search
// form, and a counter used to label each step's log banner and screenshot.
var page = new WebPage();
var url = 'http://localhost/a-search-form';
var stepIndex = 0;

/**
 * Forwards console output produced by the loaded page to PhantomJS's own
 * console, tagged with a 'console> ' prefix.
 *
 * From the PhantomJS documentation: this callback is invoked when the page
 * emits a JavaScript console message. It may accept up to three arguments:
 * the message string, the line number, and the source identifier. Only the
 * message is used here.
 */
page.onConsoleMessage = function (message, lineNumber, sourceId) {
    var prefixed = 'console> ' + message;
    console.log(prefixed);
};

/**
 * Logs alert() dialogs raised by the loaded page, tagged with an 'alert!!> '
 * prefix.
 *
 * From the PhantomJS documentation: this callback is invoked when there is a
 * JavaScript alert, and its only argument is the alert's message string.
 */
page.onAlert = function (alertText) {
    var prefixed = 'alert!!> ' + alertText;
    console.log(prefixed);
};

/**
 * Runs one step of the scrape every time a page finishes loading.
 *
 * The handler is registered on onLoadFinished instead of being passed to
 * page.open(): newer PhantomJS releases invoke the open() callback only for
 * the initial load, whereas this script needs to run again after the form
 * submission navigates to the results page.
 *
 * @param {string} status 'success' when the page loaded; anything else is
 *     treated as a failure and ends the script with exit code 1.
 */
page.onLoadFinished = function (status) {
  if (status !== 'success') {
    // Without an explicit exit PhantomJS would keep running forever.
    console.log('Unable to load page (status: ' + status + ')');
    phantom.exit(1);
    return;
  }

  console.log('============================================');
  console.log('Step "' + stepIndex + '"');
  console.log('============================================');

  // Inject jQuery for scraping (you need to save jquery-1.6.1.min.js in the
  // same folder as this file). Every step relies on $ inside the page, so a
  // failed injection is fatal.
  if (!page.injectJs('jquery-1.6.1.min.js')) {
    console.log('Unable to inject jquery-1.6.1.min.js');
    phantom.exit(1);
    return;
  }

  // Our "event loop": phantom.state is empty on the first load and afterwards
  // holds the function that handles the page we have just arrived on.
  if (!phantom.state) {
    initialize();
  } else {
    phantom.state();
  }

  // Save screenshot for debugging purposes
  page.render('step' + stepIndex++ + '.png');
};

page.open(url);

// Step 1: fill in the search form, submit it, and queue up the next step.
function initialize() {
  // Executed inside the loaded page, where $ is the injected jQuery.
  var submitSearch = function () {
    $('form#search input.query').val('Jebus saves');
    $('form#search').submit();
    console.log('Searching...');
  };

  page.evaluate(submitSearch);

  // Whatever is stored on phantom.state survives page loads, so the handler
  // for the results page is parked there until that page has loaded.
  phantom.state = parseResults;
}

// Step 2: print the href of every search-result link, then quit.
function parseResults() {
  // Executed inside the loaded page, where $ is the injected jQuery.
  var logResultLinks = function () {
    var anchors = $('#search-result a');
    anchors.each(function () {
      console.log($(this).attr('href'));
    });
    console.log('Parsed results');
  };

  page.evaluate(logResultLinks);

  // This is the final step. A third step would instead store its handler in
  // phantom.state and trigger another page load so the load callback runs
  // again.
  phantom.exit();
}

Scraping and crawling a website with pjscrape and PhantomJS

Tagged phantomjs, pjscrape  Languages javascript
/**
 * pjscrape suite that starts at google.com, follows the anchors it finds
 * (maxDepth 1) and collects the text of every link on each page.
 *
 * moreUrls relies on Underscore's _.filter, which is supplied through
 * loadScript. scraper assumes jQuery is available as $ in the scraped page.
 */
pjs.addSuite({
  url: 'http://google.com',
  maxDepth: 1,
  loadScript: ['underscore-min.js'],
  ignoreDuplicates: true,

  /**
   * Chooses which of the page's anchor URLs should be crawled next.
   * @returns {Array} the anchor URLs, minus javascript: pseudo-links.
   */
  moreUrls: function () {
    var urls = _pjs.getAnchorUrls('a', false);
    var result = _.filter(urls, function (url) {
      // Reject only URLs whose scheme is javascript:. A plain substring test
      // would also discard real pages such as /javascript-tutorial.
      return !/^\s*javascript:/i.test(url);
    });
    console.log("Found " + urls.length + " urls. Using " + result.length);
    return result;
  },

  /**
   * Extracts the data recorded for the current page.
   * @returns {Array} the text content of every anchor, in document order.
   */
  scraper: function () {
    return $('a').map(function (index, elem) {
      return $(elem).text();
    }).toArray();
  }
});

How to use PhantomJS to take screenshots

Tagged phantomjs, screenshot  Languages javascript
# Renders a screenshot of a web page to an image file.
#
# Command-line arguments (read from phantom.args):
#   0 - address of the page to capture
#   1 - name of the output file handed to page.render
# A third argument passes the length check below but is never read.
#
# Exits with status 1 on a usage error or when the page cannot be loaded.
page = new WebPage()

# The viewport and the clip rectangle share the same dimensions.
WIDTH = 1024
HEIGHT = 760

if phantom.args.length < 2 or phantom.args.length > 3
  console.log "Usage: phantomjs screenshot.coffee URL filename"
  phantom.exit 1
else
  address = phantom.args[0]
  output = phantom.args[1]
  # settings.userAgent holds the header *value* only; prefixing it with
  # "User-Agent:" would put the header name inside the value that is sent.
  ua = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.101 Safari/537.11"
  page.settings.userAgent = ua
  page.customHeaders = "Accept-Language": "sv-SE,sv;q=0.8,en-US;q=0.6,en;q=0.4"
  page.viewportSize =
    width: WIDTH
    height: HEIGHT

  page.open address, (status) ->
    if status isnt "success"
      console.log "Unable to load the address!"
      phantom.exit 1
    else
      # Rendering is deferred by 200 ms -- presumably to let the page finish
      # painting before the capture; confirm before changing the delay.
      window.setTimeout (->
        page.clipRect =
          top: 0
          left: 0
          width: WIDTH
          height: HEIGHT

        page.render output
        console.log "Exiting"
        phantom.exit()
      ), 200

Usage:

phantomjs screenshot.coffee http://google.com google.png