I have built a ton of various scrapers, parsers and scripts in the past to get at all kinds of data on the web. I have mostly used PHP, Ruby and more recently Node.js (JavaScript).

When I saw a post on Bountify for a scraper for Seeking Alpha, I was curious. I wasn’t familiar with Seeking Alpha, but I had a free night (my wife was at an event and I was at home watching the dog), so I figured, why not give it a try?

After taking Seeking Alpha for a spin, I knew I could parse the profiles, but scraping using their search was going to be more work than the bounty offered. Figuring that only a couple of developers, if any, were going to post an answer, I decided to do what I could in as little time as possible.

Bountify

I simplified the code below, but you can download the gist.

var fs = require('fs'),
    request = require('request'),
    async = require('async'),
    _ = require('underscore'),
    cheerio = require('cheerio');

// Rows destined for results.csv; the profile-scraping step below pushes one
// object per matching author into this array.
var outputCSV = [];
//******************************************************
// EACH CATEGORY URL (Top 100)
// http://seekingalpha.com/leader-board/opinion-leaders
//******************************************************
// Leaderboard pages that are fetched and scanned for author profile links.
var urls = [
"http://seekingalpha.com/opinion-leaders/long-ideas",
"http://seekingalpha.com/opinion-leaders/long-ideas/2",
"http://seekingalpha.com/opinion-leaders/long-ideas/3",
// NOTE(review): the `...` on the next line looks like an elision of the
// remaining URLs made for this write-up, not intended code — replace it with
// the real entries before running.
...
"http://seekingalpha.com/opinion-leaders/mutual-funds/3",
"http://seekingalpha.com/opinion-leaders/mutual-funds/4"
]

var authors = [];

/**
 * Wrap a value in double quotes for use as a CSV field.
 * Embedded double quotes are doubled, which is the CSV convention (RFC 4180);
 * JSON-style backslash escapes are not understood by CSV readers.
 *
 * @param {*} value - Value to emit; converted with String().
 * @returns {string} The quoted field.
 */
function csvQuote(value) {
  return '"' + String(value).replace(/"/g, '""') + '"';
}

// Phase 1: fetch every leaderboard page and collect the author profile links.
async.forEach(urls, function (url, urlcb) {
  request(url, function (error, response, body) {
    if (error || response.statusCode !== 200) {
      console.log("Check Connection: Issue viewing: " + url);
      console.log(error);
      // A leaderboard page we cannot read is fatal. Exit non-zero so that
      // whatever launched the script can tell the run failed.
      process.exit(1);
      return;
    }

    var $ = cheerio.load(body);
    // Original note (an XPath, presumably of the list container):
    //   //*[@id="author_articles_wrapper"]/div/div[1]/ul
    ['.ld_top_list li a', '.ld_more_list li a'].forEach(function (selector) {
      $(selector).each(function () {
        var href = $(this).attr('href');
        // Skip anchors without an href so no ".../undefined" URL is requested later.
        if (href) {
          authors.push(href);
        }
      });
    });
    console.log("FETCH: " + url);
    urlcb();
  });
}, function () {
  // Phase 2: visit each distinct author profile and look for the keywords.
  var uniqAuthors = _.uniq(authors);
  async.forEach(uniqAuthors, function (name, pcb) {
    var url = 'http://seekingalpha.com' + name;
    request(url, function (error, response, body) {
      if (error || response.statusCode !== 200) {
        // Best effort: one unreachable profile should not abort the whole
        // run, but report it instead of dropping it silently.
        console.log("Skipping (fetch failed): " + url);
        pcb();
        return;
      }

      console.log("Fetching: " + url);
      var $ = cheerio.load(body);
      var author = $("#author_full_name").text();
      var profile = $('.profile_item_mini_text').text().toLowerCase();

      // Record which keywords appear in the (lower-cased) profile text.
      var keywords = [];
      if (contains(profile, 'hedge fund')) { keywords.push('hedge fund'); }
      if (contains(profile, 'former')) { keywords.push('former'); }
      // "ex" must match as a whole word ("ex-trader", "ex trader"); a plain
      // substring test would also hit "experience", "expert", "index", ...
      if (/\bex\b/.test(profile)) { keywords.push('ex'); }
      if (contains(profile, 'portfolio manager')) { keywords.push('portfolio manager'); }

      // Only authors with at least one keyword hit go into the report.
      if (keywords.length > 0) {
        outputCSV.push({
          author: author,
          url: url,
          keywords: keywords,
          count: keywords.length
        });
      }
      pcb();
    });
  }, function () {
    // Phase 3: write the collected rows to results.csv.
    console.log("DONE");
    var wstream = fs.createWriteStream('results.csv', {'flags': 'w'});
    wstream.write("Author,URL,Count,Keywords\n");
    // write() only queues data, so a plain synchronous loop is enough here.
    outputCSV.forEach(function (a) {
      wstream.write(
        csvQuote(a.author)
        + "," + a.url + ","
        + a.count + ","
        + csvQuote(a.keywords.join())
        + "\n"
      );
    });
    // Exit only after the stream reports that everything was flushed;
    // calling process.exit() right after end() could cut the file short.
    wstream.end(function () {
      console.log("DONE");
      process.exit(0);
    });
  });
});

/**
 * Report whether `s` occurs anywhere inside `r`.
 *
 * @param {string|Array} r - Haystack; anything exposing indexOf().
 * @param {*} s - Needle handed to r.indexOf().
 * @returns {boolean} true when indexOf() finds a position, false otherwise.
 */
function contains(r, s) {
  var position = r.indexOf(s);
  // indexOf() yields -1 for "not found" and a non-negative index otherwise.
  return position >= 0;
}