-
Notifications
You must be signed in to change notification settings - Fork 0
/
index.js
105 lines (85 loc) · 2.3 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
"use strict";
var _ = require('lodash');
var request = require('request');
var cheerio = require('cheerio');
var natural = require('natural');
// Single TfIdf instance shared by every analyze() call in this module.
var tfidf = new natural.TfIdf();
// Logging goes through this alias so the sink can be swapped in one place;
// for now it is simply the console.
var logger = console;
// Command-line parsing. The usage text advertises one <url> argument;
// NOTE(review): demand(1) presumably makes that positional argument
// mandatory -- confirm against the installed yargs version.
var argv = require('yargs')
.usage('Usage: node $0 <url> [options]')
.demand(1)
.example('node $0 http://google.com')
.argv;
/**
 * Records anchor-tag statistics on `meta` without altering the page text.
 *
 * When the document contains at least one `<a>` element, `meta.a` is set to
 * `{ total, urls }`: `total` counts every anchor, and `urls` holds the
 * lower-cased, de-duplicated `href` values of the anchors that have one.
 * When there are no anchors, `meta` is left untouched.
 *
 * @param {Function} $ - cheerio-style selector function for the loaded page.
 * @param {string} txt - page text; passed through unchanged.
 * @param {Object} meta - per-tag metadata object, mutated in place.
 * @returns {string} the same `txt` that was passed in.
 */
function links($, txt, meta){
  var tag = 'a';
  var $anchors = $(tag);

  // Nothing to record when the page has no anchors.
  if (!($anchors.length > 0)){
    return txt;
  }

  var hrefs = [];
  $anchors.each(function(){
    var target = $(this).attr('href');
    if (target){
      hrefs.push(target.toLowerCase());
    }
  });

  meta[tag] = {
    total: $anchors.length,
    urls: _.uniq(hrefs)
  };
  return txt;
}
/**
 * Removes the text content of every `tag` element from `txt` and records
 * how many elements were found and how many were actually stripped.
 *
 * When at least one `tag` element exists, `meta[tag]` is set to
 * `{ removed, total }`. For each element, the first occurrence of its text
 * in the (progressively shortened) `txt` is cut out. Elements whose text is
 * empty, or whose text is not present in `txt`, are counted in `total` only.
 * When no `tag` element exists, `meta` is left untouched.
 *
 * @param {Function} $ - cheerio-style selector function for the loaded page.
 * @param {string} tag - tag name / selector whose text should be stripped.
 * @param {string} txt - text to strip from.
 * @param {Object} meta - per-tag metadata object, mutated in place.
 * @returns {string} `txt` with the matched element texts removed.
 */
function removeTagBlock($, tag, txt, meta){
  var $tags = $(tag);
  if (!($tags.length > 0)){
    return txt;
  }

  var stats = {
    removed: 0,
    total: $tags.length
  };
  meta[tag] = stats;

  $tags.each(function(){
    var blockText = $(this).text();
    // An empty string is "found" at index 0 of any text, which would count
    // the element as removed even though nothing was taken out.
    if (!blockText){
      return;
    }
    var index = txt.indexOf(blockText);
    if (index === -1){
      return;
    }
    // Cut out exactly the occurrence that was located (the first one).
    txt = txt.slice(0, index) + txt.slice(index + blockText.length);
    stats.removed += 1;
  });
  return txt;
}
/**
 * Fetches `url` and logs a JSON report about the page.
 *
 * The report contains the url, the page title, per-tag statistics gathered
 * by removeTagBlock()/links(), the whitespace-normalized body text (with
 * script, code and iframe text stripped) and the tf-idf terms of that text.
 * The request is asynchronous: this function returns immediately and all
 * results or failures are reported through `logger`.
 *
 * @param {string} url - address of the page to fetch and analyze.
 * @returns {undefined}
 */
function analyze(url){
  logger.info('requesting: ', url);
  request({
    "url": url,
    "method": 'get',
    "headers": {
      'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36',
      'accept': 'text/html,application/xhtml+xml'
    }
  }, function(err, res, body){
    if (err){
      return logger.error(err);
    }
    // A non-200 response arrives without an error object; report the status
    // explicitly instead of logging a bare `null`.
    if (res.statusCode !== 200){
      return logger.error('request failed: ' + url + ' responded with HTTP status ' + res.statusCode);
    }

    var $ = cheerio.load(body);
    var meta = {};
    meta.tags = {};
    meta.url = url;
    meta.title = $("title").text();

    var txt = $("body").text();
    txt = removeTagBlock($, 'script', txt, meta.tags);
    txt = removeTagBlock($, 'code', txt, meta.tags);
    txt = removeTagBlock($, 'iframe', txt, meta.tags);
    txt = links($, txt, meta.tags);
    meta.text = txt.replace(/\s+/g,' ');

    meta.tfidf = [];
    tfidf.addDocument(txt);
    // The TfIdf instance is shared across calls, so list the terms of the
    // document that was just appended rather than always the first one.
    var docIndex = tfidf.documents.length - 1;
    tfidf.listTerms(docIndex).forEach(function(item) {
      meta.tfidf.push(item.term + ': ' + item.tfidf);
    });

    logger.info(JSON.stringify(meta, undefined, 2));
  });
}
// Script entry point: analyze the first positional command-line argument
// (the <url> shown in the usage text).
analyze(argv._[0]);