web scraping

Web scraping using node.js and third-party libraries such as PhantomJS, cheerio and jQuery.

There are 2 snippets with the web scraping tag

  • Web Scraping with node.js and cheerio

    Use node.js and the cheerio module to scrape (or extract) data from a web page. We use the http core module to fetch the page and cheerio to convert the response body into a queryable DOM, which we then use to parse and extract data from specific HTML elements.

    var http = require("http");
    var cheerio = require("cheerio");
    
    var server = http.createServer(function(req, res) {
    
      var req_opts = {
        host:"en.wikipedia.org",
        path:"/wiki/London"
      };
      var response_text = "";
    
      // 1. Perform an HTTP request to Wikipedia
      var request = http.request(req_opts, function(resp) {
        if(resp.statusCode !== 200) {
          throw new Error("Unexpected status code: " + resp.statusCode);
        }
        resp.setEncoding("utf8");
        resp.on("data", function (chunk) {
          response_text += chunk;
        });
        resp.on("end", function() {
    
          // 2. Parse response using cheerio
          var $ = cheerio.load(response_text);
    
          // Begin writing our output HTML
          res.writeHead(200, {"Content-Type": "text/html"});
          res.write("<html><head><meta charset='UTF-8' />");
          res.write("</head><body><table>");
    
          // Iterate over TR elements in the Wikipedia infobox
          $("table.geography tr").each(function(tr_index, tr) {
            var th_text = $(this).find("th").text();
            var prop_name
              = th_text.trim().toLowerCase().replace(/[^a-z]/g,"");
    
            // We're only interested in these 3 fields
            if({"country":1,"mayor":1,"elevation":1}[prop_name])
            {
              // 3. Write out our tabulated data
              res.write("<tr><th>" + prop_name + "</th><td>");
              res.write($(this).find("td").text());
              res.write("</td></tr>");
            }
          });
    
          // And... we're done
          res.end("</table></body></html>");
        });
      });
    
      request.on("error", function(e) {
        throw "Error: " + e.message;
      });
    
      request.end();
    
    }).listen(8080);
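
    If you only want to experiment with the cheerio half of the technique (step 2), the sketch below skips the HTTP plumbing and loads a small hand-written fragment shaped like the Wikipedia infobox markup. The fragment and its cell values are invented stand-ins, but the selector and the property-name normalization are the same as in the snippet above.

    var cheerio = require("cheerio");

    // Invented stand-in for the infobox markup on the Wikipedia page
    var html =
      "<table class='geography'>" +
      "<tr><th>Mayor</th><td>(mayor goes here)</td></tr>" +
      "<tr><th>Elevation</th><td>(elevation goes here)</td></tr>" +
      "</table>";

    var $ = cheerio.load(html);

    $("table.geography tr").each(function(tr_index, tr) {
      // Normalize the header text, e.g. "Mayor" -> "mayor"
      var prop_name = $(this).find("th").text()
        .trim().toLowerCase().replace(/[^a-z]/g, "");
      console.log(prop_name + ": " + $(this).find("td").text());
    });

    Running it with node prints one line per table row, which makes it easy to check the selector logic before wiring it into the server.
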
    27 Jan 2014
  • Web Scraping with node.js and PhantomJS

    Use node.js with PhantomJS to scrape (or extract) data from a web page. We use the phantom bridge module to open a page from Wikipedia in a headless browser, then inject jQuery into it to parse the document and extract the data.

    var http = require("http");
    var phantom = require("phantom");
    
    var url = "http://en.wikipedia.org/wiki/London";
    
    var server = http.createServer(function(req, res) {
    
      // Spin up a headless browser instance for this request
      phantom.create(function (ph) {
        ph.createPage(function (page) {
          page.open(url, function (status) {

            // We use jQuery to parse the document
            page.includeJs(
              "http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js",
              function() {
                page.evaluate(function() {

                  // This function runs inside the page
                  var data = {};

                  $("table.geography tr").each(function(tr_index, tr) {
                    var th_text = $(this).find("th").text();
                    var prop_name
                      = th_text.trim().toLowerCase().replace(/[^a-z]/g,"");

                    // We're only interested in these 3 fields
                    if({"country":1,"mayor":1,"elevation":1}[prop_name]) {
                      data[prop_name] = $(this).find("td").text();
                    }
                  });

                  return data;

                }, function(data) {

                  ph.exit();

                  // Begin writing our output HTML
                  res.writeHead(200, {"Content-Type": "text/html"});
                  res.write("<html><head><meta charset='UTF-8' />");
                  res.write("</head><body><table>");

                  for(var prop in data) {
                    res.write("<tr><th>" + prop + "</th><td>");
                    res.write(data[prop]);
                    res.write("</td></tr>");
                  }

                  res.end("</table></body></html>");

                  process.exit(0);
                });
              }
            );

          });
        });
      });
    
    }).listen(8080);
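
    The function passed to page.evaluate runs inside the headless page, so it can use jQuery and the live DOM, but it can only hand back serializable values; that is why the snippet collects everything into a plain data object and writes the HTML response back in node. Below is a minimal sketch of that round trip, assuming the same callback-style version of the phantom bridge used above (newer releases of the module are promise-based).

    var phantom = require("phantom");

    phantom.create(function (ph) {
      ph.createPage(function (page) {
        page.open("http://en.wikipedia.org/wiki/London", function (status) {
          page.evaluate(function() {
            // Runs inside the page; only serializable values survive the trip back
            return document.title;
          }, function(title) {
            console.log(status + ": " + title);
            ph.exit();
          });
        });
      });
    });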
    
    31 Jul 2014