You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
This repo is archived. You can view files and clone it, but cannot push or open issues/pull-requests.

180 lines
4.8 KiB

/* Copyright (C) 2014 Alinson Xavier
*
* This file is part of Scholarium.
*
* Scholarium is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation, either version 3 of the License, or (at your option)
* any later version.
*
* This software is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License along with
* this software. If not, see <http://www.gnu.org/licenses/>.
*/
/**
 * Crawler that expands a citation graph one queued node at a time.
 *
 * @param parser  stored as this.parser; _add_citations_from_parser calls its
 *                parse(url, callback) method
 * @param nodes   stored as this.nodes; the methods of this class call
 *                getIds(), add() and update() on it
 *                (NOTE(review): looks like a vis.js DataSet, please confirm)
 * @param edges   stored as this.edges; _add_child_article calls getIds() and
 *                add() on it
 */
function ScholarCrawler(parser, nodes, edges)
{
    // Pending [node, levels] pairs: filled by push(), drained by start().
    this.stack = [];

    // Collaborators handed in by the caller.
    this.nodes = nodes;
    this.edges = edges;
    this.parser = parser;

    // Defaults. The delay is presumably in milliseconds (TODO confirm);
    // articles with fewer citations than minimum_citations are skipped by
    // _add_child_article.
    this.delay = 1000;
    this.minimum_citations = 0;
}
/**
 * Builds the graph-node object that represents an article.
 *
 * The node keeps a reference to the article itself, sizes itself from the
 * article's n_citations, and carries an HTML tooltip made of the authors, a
 * link that calls open_external(url), the title, the source and the year.
 *
 * @param article  article record; reads _id, n_citations, authors, url,
 *                 title, source and year
 * @returns a new node object (the article is referenced, not copied)
 */
ScholarCrawler.prototype.article_to_node = function(article)
{
    var citation_count = article.n_citations;

    // NOTE(review): the article fields are interpolated into the markup
    // without escaping. Whether they are already HTML-safe depends on the
    // parser, which is not visible here - please confirm.
    var tooltip = "<span class='node_tooltip'>" + article.authors;
    tooltip += ".<a href='' onclick='return open_external(\"" + article.url + "\")' target='_blank'> "
        + article.title + ".</a> ";
    tooltip += article.source + ", " + article.year + ".</span>";

    var node = {
        id: article._id,
        _id: article._id,
        article: article,
        group: 'standard',
        label: '',
        shape: 'dot',
        // More cited articles get a larger mass and radius.
        mass: citation_count/2 + 1,
        radius: 3*Math.pow(citation_count, 0.8) + 3,
        title: tooltip
    };
    return node;
};
/**
 * Loads the articles that cite parent_node, choosing between the local
 * database and the parser.
 *
 * Dummy nodes (parent_node.is_dummy) always go through the parser. For any
 * other node, articles_db is queried by _id: when the stored record is marked
 * is_cached the citations are read from the database, otherwise they are
 * fetched through the parser.
 *
 * @param parent_node  node whose citations should be added
 * @param levels       non-negative number of further levels to expand
 * @returns whatever _add_citations_from_parser returns for dummy nodes,
 *          undefined otherwise (the database lookup uses a callback)
 */
ScholarCrawler.prototype.add_citations = function(parent_node, levels)
{
    assert(levels >= 0, "levels should be non-negative");
    var crawler = this;

    if(parent_node.is_dummy)
        return this._add_citations_from_parser(parent_node, levels);

    articles_db.findOne({_id: parent_node._id}, function(err, parent_article_db)
    {
        // A failed lookup may hand back undefined instead of null; treat an
        // error or any missing record like "not cached" instead of crashing
        // on parent_article_db.is_cached.
        var is_cached = !err && parent_article_db != null && parent_article_db.is_cached;

        if(!is_cached)
            crawler._add_citations_from_parser(parent_node, levels);
        else
            crawler._add_citations_from_db(crawler.article_to_node(parent_node.article), levels);
    });
};
/**
 * Adds the citations of parent_node using only the local databases.
 *
 * Every record in citations_db whose `to` equals the parent's article _id is
 * resolved to its citing article in articles_db (by `from`) and handed to
 * _add_child_article. Afterwards the parent's group is reset to "standard"
 * and the node is updated in this.nodes.
 *
 * @param parent_node  node whose citations should be added; reads
 *                     parent_node.article._id
 * @param levels       non-negative number of further levels to expand
 */
ScholarCrawler.prototype._add_citations_from_db = function(parent_node, levels)
{
    assert(levels >= 0, "levels should be non-negative");
    var crawler = this;

    citations_db.find({to: parent_node.article._id}, function(err, citations)
    {
        // A failed query must not crash on citations.forEach; treat it as
        // "no citations" so the parent still leaves the processing state.
        var stored_citations = (!err && citations) ? citations : [];

        stored_citations.forEach(function(citation)
        {
            articles_db.findOne({_id: citation.from}, function(err, child_article)
            {
                // Skip citations whose citing article cannot be loaded
                // (lookup error or no such record) instead of passing
                // null/undefined on to _add_child_article.
                if(err || !child_article)
                    return;
                crawler._add_child_article(child_article, parent_node, levels);
            });
        });

        parent_node.group = "standard";
        crawler.nodes.update(parent_node);
    });
};
/**
 * Adds the citations of parent_node by parsing its citations_url.
 *
 * Each parsed article is looked up in articles_db by _id. A stored record is
 * preferred when one exists; otherwise the freshly parsed article is flagged
 * is_cached = false, inserted into articles_db and used instead. Either way
 * the article is handed to _add_child_article.
 *
 * For non-dummy parents the node is then reset to the "standard" group and
 * updated in this.nodes, and the parent article is flagged is_cached = true
 * and written back to articles_db.
 *
 * @param parent_node  node whose citations should be added; reads
 *                     parent_node.article.citations_url
 * @param levels       non-negative number of further levels to expand
 */
ScholarCrawler.prototype._add_citations_from_parser = function(parent_node, levels)
{
    assert(levels >= 0, "levels should be non-negative");
    var crawler = this;

    this.parser.parse(parent_node.article.citations_url, function(child_articles)
    {
        child_articles.forEach(function(child_article)
        {
            articles_db.findOne({_id: child_article._id}, function(err, child_article_db)
            {
                if(!err && child_article_db)
                {
                    crawler._add_child_article(child_article_db, parent_node, levels);
                    return;
                }

                // No usable stored record: fall back to the parsed article.
                // A failed lookup may yield undefined rather than null, so it
                // must not be forwarded as if it were a stored article.
                child_article.is_cached = false;

                // Only insert when the lookup really reported "not found";
                // after an error we cannot tell whether the record exists.
                if(!err)
                    articles_db.insert(child_article);

                crawler._add_child_article(child_article, parent_node, levels);
            });
        });

        if(!parent_node.is_dummy)
        {
            parent_node.group = "standard";
            crawler.nodes.update(parent_node);
            parent_node.article.is_cached = true;
            articles_db.update({_id: parent_node.article._id}, parent_node.article, {});
        }
    });
};
/**
 * Adds one citing article to the graph as a child of parent_node.
 *
 * Articles with fewer citations than this.minimum_citations are ignored.
 * Unless the parent is a dummy node, an edge child -> parent (id = md5 of the
 * two node ids) is added to this.edges and stored in citations_db, once per
 * id. Children that have citations of their own are queued for expansion
 * while levels > 0. Finally the child node is added to this.nodes, or
 * updated when its id is already there.
 *
 * @param child_article  article that cites the parent
 * @param parent_node    node being cited
 * @param levels         non-negative number of further levels to expand
 */
ScholarCrawler.prototype._add_child_article = function(child_article, parent_node, levels)
{
    assert(levels >= 0, "levels should be non-negative");
    var child_node = this.article_to_node(child_article);

    if(child_article.n_citations < this.minimum_citations)
        return;

    if(!parent_node.is_dummy)
    {
        // Declared locally: assigning to an undeclared `edge` leaked a global
        // and throws a ReferenceError in strict mode / ES modules.
        var edge = {
            id: $.md5(child_node._id + parent_node._id),
            from: child_node._id,
            to: parent_node._id
        };
        edge._id = edge.id;

        if(this.edges.getIds().indexOf(edge._id) < 0)
        {
            this.edges.add(edge);
            citations_db.insert(edge);
        }
    }

    // push() marks the node as "processing", so it has to run before the
    // node is added or updated below.
    if(child_article.n_citations > 0 && levels > 0)
        this.push(child_node, levels-1);

    if(this.nodes.getIds().indexOf(child_node._id) < 0)
        this.nodes.add(child_node);
    else
        this.nodes.update(child_node);
};
/**
 * Queues a node so that a later start() tick expands its citations.
 *
 * The [parent_node, levels] pair is appended to this.stack. Non-dummy nodes
 * are additionally switched to the "processing" group, and the change is sent
 * to this.nodes when the node's id is already present there.
 *
 * @param parent_node  node to expand later
 * @param levels       non-negative number of levels to expand below it
 */
ScholarCrawler.prototype.push = function(parent_node, levels)
{
    assert(levels >= 0, "levels should be non-negative");

    var pending = [parent_node, levels];
    this.stack.push(pending);

    // Dummy nodes are queued only; they get no group change.
    if(parent_node.is_dummy)
        return;

    parent_node.group = "processing";

    var known_ids = this.nodes.getIds();
    var already_in_graph = known_ids.indexOf(parent_node._id) !== -1;
    if(already_in_graph)
        this.nodes.update(parent_node);
};
/**
 * Runs the crawl loop: expands at most one queued node, then re-schedules
 * itself with setTimeout. The loop keeps polling even while the stack is
 * empty, so nodes pushed later are picked up on a following tick.
 *
 * The most recently pushed entry is processed first (the stack is popped).
 * The wait between ticks is this.delay; it was previously hard-coded to 1000,
 * which left the delay configured on the instance unused.
 */
ScholarCrawler.prototype.start = function()
{
    if(this.stack.length > 0)
    {
        var args = this.stack.pop();
        this.add_citations(args[0], args[1]);
    }

    var crawler = this;

    // Fall back to the historical 1000 when delay is missing or unusable.
    var has_valid_delay = typeof this.delay === "number" && isFinite(this.delay) && this.delay >= 0;
    var delay = has_valid_delay ? this.delay : 1000;

    setTimeout(function() { crawler.start(); }, delay);
};