
Merge pull request #382 from elxa/fix-issue-364
Performance optimization for very large sitemaps.
fredrikekelund committed Jul 25, 2017
2 parents 750511d + 1611bd6 commit d7af4f1
Showing 5 changed files with 35 additions and 47 deletions.
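The substance of this commit is in lib/crawler.js: URL deduplication that previously used Array#reduce with a linear Array#some scan (quadratic overall, with list.concat copying the accumulator on every kept item) now collects URLs into an ES6 Set. A minimal sketch of the two patterns, using illustrative names not taken from the repository:

// O(n²): each candidate is compared against every previously kept item,
// and concat() copies the accumulator array on each addition.
function dedupeWithArray(items) {
    return items.reduce(function(list, item) {
        var exists = list.some(function(entry) {
            return entry === item;
        });
        return exists ? list : list.concat(item);
    }, []);
}

// O(n): Set membership checks are amortized constant time.
function dedupeWithSet(items) {
    const seen = new Set();
    for (let i = 0; i < items.length; i++) {
        seen.add(items[i]);
    }
    return Array.from(seen);
}

For a sitemap yielding tens of thousands of URLs, the first version performs on the order of hundreds of millions of string comparisons; the second touches each URL once, which is where the speedup for very large sitemaps comes from.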
.eslintrc (2 additions & 1 deletion)

@@ -2,7 +2,8 @@
     "root": true,
 
     "env": {
-        "node": true
+        "node": true,
+        "es6": true
     },
 
     "rules": {
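The "es6": true entry is what lets the rewritten crawler.js lint: per ESLint's documentation, the es6 environment enables ES2015 globals such as Set and implies parserOptions.ecmaVersion: 6. A minimal illustration of what this permits:

// Rejected under the old config (espree's default is ES5, where `const`,
// `let`, and the `Set` global don't exist); accepted with "es6": true.
const URLs = new Set();
let URL = "http://example.com/";
URLs.add(URL);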
.travis.yml (2 deletions)

@@ -3,8 +3,6 @@ sudo: false
 language: node_js
 
 node_js:
-  - "0.10"
-  - "0.12"
   - "4"
   - "5"
   - "6"
README.md (1 addition & 5 deletions)

@@ -884,14 +884,10 @@ list below before submitting an issue.
 ## Node Support Policy
 
 Simplecrawler will officially support stable and LTS versions of Node which are
-currently supported by the Node Foundation. We will endeavour to continue to
-support Node 0.10.x — but now that it has fallen out of LTS it is likely we will
-adopt newer JS syntax and APIs which 0.10.x does not support.
+currently supported by the Node Foundation.
 
 Currently supported versions:
 
-- 0.10.x
-- 0.12.x
 - 4.x
 - 5.x
 - 6.x
appveyor.yml (4 deletions)

@@ -6,10 +6,6 @@ version: "{build}"
 
 environment:
   matrix:
-    - nodejs_version: "0.10"
-      platform: x86
-    - nodejs_version: "0.12"
-      platform: x86
     - nodejs_version: "4"
       platform: x64
     - nodejs_version: "4"
lib/crawler.js (32 additions & 35 deletions)

@@ -832,52 +832,49 @@ function cleanURL (URL, queueItem) {
  * @param {QueueItem} queueItem The queue item representing the resource where the URL's were discovered
  * @return {Array} Returns an array of unique and absolute URL's
  */
-Crawler.prototype.cleanExpandResources = function(urlMatch, queueItem) {
+Crawler.prototype.cleanExpandResources = function (urlMatch, queueItem) {
     "use strict";
     var crawler = this;
 
     if (!urlMatch) {
         return [];
     }
 
-    return urlMatch
-        .filter(Boolean)
-        .map(function(url) {
-            return cleanURL(url, queueItem);
-        })
-        .reduce(function(list, URL) {
+    const URLs = new Set();
+    let URL;
+    for (let i = 0; i < urlMatch.length; i++) {
+        URL = urlMatch[i];
 
-            // Ensure URL is whole and complete
-            try {
-                URL = uri(URL)
-                    .absoluteTo(queueItem.url || "")
-                    .normalize()
-                    .href();
-            } catch (e) {
-                // But if URI.js couldn't parse it - nobody can!
-                return list;
-            }
+        if (!URL) {
+            continue;
+        }
 
-            // If we hit an empty item, don't return it
-            if (!URL.length) {
-                return list;
-            }
+        URL = cleanURL(URL, queueItem);
+
+        // Ensure URL is whole and complete
+        try {
+            URL = uri(URL)
+                .absoluteTo(queueItem.url || "")
+                .normalize()
+                .href();
+        } catch (e) {
+            // But if URI.js couldn't parse it - nobody can!
+            continue;
+        }
 
-            // If we don't support the protocol in question
-            if (!crawler.protocolSupported(URL)) {
-                return list;
-            }
+        // If we hit an empty item, don't return it
+        if (!URL.length) {
+            continue;
+        }
 
-            // Does the item already exist in the list?
-            var exists = list.some(function(entry) {
-                return entry === URL;
-            });
+        // If we don't support the protocol in question
+        if (!crawler.protocolSupported(URL)) {
+            continue;
+        }
 
-            if (exists) {
-                return list;
-            }
+        URLs.add(URL);
+    }
 
-            return list.concat(URL);
-        }, []);
+    return Array.from(URLs);
 };
 
 /**
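To make the new control flow concrete, here is a hypothetical call to the rewritten method. The constructor usage assumes the v1-era simplecrawler API, and the queue item is a bare object, which suffices here since the method only reads queueItem.url; the exact output is illustrative and subject to cleanURL's normalization.

var Crawler = require("simplecrawler");
var crawler = new Crawler("http://example.com/");

// urlMatch would normally come from a RegExp scan of a fetched document.
var discovered = crawler.cleanExpandResources(
    ["/a", "/a", "b.html", ""],
    { url: "http://example.com/index.html" }
);

// Empty matches are skipped, relative URLs are resolved against the queue
// item, and the Set collapses duplicates, yielding roughly:
// ["http://example.com/a", "http://example.com/b.html"]

Behavior is otherwise unchanged: falsy entries that were previously dropped by filter(Boolean) are now skipped by the if (!URL) guard, and uniqueness moves from a linear some() scan over the accumulator to Set semantics.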
