
Merge pull request #382 from elxa/fix-issue-364
Performance optimization for very large sitemaps.
fredrikekelund committed Jul 25, 2017
2 parents 750511d + 1611bd6 commit d7af4f1
Showing 5 changed files with 35 additions and 47 deletions.
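The substance of this commit is in lib/crawler.js: URL deduplication that previously used Array#reduce with a linear Array#some scan (quadratic overall, with list.concat copying the accumulator on every kept item) now collects URLs into an ES6 Set. A minimal sketch of the two patterns, using illustrative names not taken from the repository:

// O(n²): each candidate is compared against every previously kept item,
// and concat() copies the accumulator array on each addition.
function dedupeWithArray(items) {
    return items.reduce(function(list, item) {
        var exists = list.some(function(entry) {
            return entry === item;
        });
        return exists ? list : list.concat(item);
    }, []);
}

// O(n): Set membership checks are amortized constant time.
function dedupeWithSet(items) {
    const seen = new Set();
    for (let i = 0; i < items.length; i++) {
        seen.add(items[i]);
    }
    return Array.from(seen);
}

For a sitemap yielding tens of thousands of URLs, the first version performs on the order of hundreds of millions of string comparisons; the second touches each URL once, which is where the speedup for very large sitemaps comes from.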
.eslintrc (2 additions & 1 deletion)

@@ -2,7 +2,8 @@
     "root": true,
 
     "env": {
-        "node": true
+        "node": true,
+        "es6": true
     },
 
     "rules": {
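The "es6": true entry is what lets the rewritten crawler.js lint: per ESLint's documentation, the es6 environment enables ES2015 globals such as Set and implies parserOptions.ecmaVersion: 6. A minimal illustration of what this permits:

// Rejected under the old config (espree's default is ES5, where `const`,
// `let`, and the `Set` global don't exist); accepted with "es6": true.
const URLs = new Set();
let URL = "http://example.com/";
URLs.add(URL);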
.travis.yml (2 deletions)

@@ -3,8 +3,6 @@ sudo: false
 language: node_js
 
 node_js:
-  - "0.10"
-  - "0.12"
   - "4"
   - "5"
   - "6"
README.md (1 addition & 5 deletions)

@@ -884,14 +884,10 @@ list below before submitting an issue.
 ## Node Support Policy
 
 Simplecrawler will officially support stable and LTS versions of Node which are
-currently supported by the Node Foundation. We will endeavour to continue to
-support Node 0.10.x — but now that it has fallen out of LTS it is likely we will
-adopt newer JS syntax and APIs which 0.10.x does not support.
+currently supported by the Node Foundation.
 
 Currently supported versions:
 
-- 0.10.x
-- 0.12.x
 - 4.x
 - 5.x
 - 6.x
appveyor.yml (4 deletions)

@@ -6,10 +6,6 @@ version: "{build}"
 
 environment:
   matrix:
-    - nodejs_version: "0.10"
-      platform: x86
-    - nodejs_version: "0.12"
-      platform: x86
     - nodejs_version: "4"
       platform: x64
     - nodejs_version: "4"
lib/crawler.js (32 additions & 35 deletions)

@@ -832,52 +832,49 @@ function cleanURL (URL, queueItem) {
  * @param {QueueItem} queueItem The queue item representing the resource where the URL's were discovered
  * @return {Array} Returns an array of unique and absolute URL's
  */
-Crawler.prototype.cleanExpandResources = function(urlMatch, queueItem) {
+Crawler.prototype.cleanExpandResources = function (urlMatch, queueItem) {
     "use strict";
     var crawler = this;
 
     if (!urlMatch) {
         return [];
     }
 
-    return urlMatch
-        .filter(Boolean)
-        .map(function(url) {
-            return cleanURL(url, queueItem);
-        })
-        .reduce(function(list, URL) {
+    const URLs = new Set();
+    let URL;
+    for (let i = 0; i < urlMatch.length; i++) {
+        URL = urlMatch[i];
 
-            // Ensure URL is whole and complete
-            try {
-                URL = uri(URL)
-                    .absoluteTo(queueItem.url || "")
-                    .normalize()
-                    .href();
-            } catch (e) {
-                // But if URI.js couldn't parse it - nobody can!
-                return list;
-            }
+        if (!URL) {
+            continue;
+        }
 
-            // If we hit an empty item, don't return it
-            if (!URL.length) {
-                return list;
-            }
+        URL = cleanURL(URL, queueItem);
+
+        // Ensure URL is whole and complete
+        try {
+            URL = uri(URL)
+                .absoluteTo(queueItem.url || "")
+                .normalize()
+                .href();
+        } catch (e) {
+            // But if URI.js couldn't parse it - nobody can!
+            continue;
+        }
 
-            // If we don't support the protocol in question
-            if (!crawler.protocolSupported(URL)) {
-                return list;
-            }
+        // If we hit an empty item, don't return it
+        if (!URL.length) {
+            continue;
+        }
 
-            // Does the item already exist in the list?
-            var exists = list.some(function(entry) {
-                return entry === URL;
-            });
+        // If we don't support the protocol in question
+        if (!crawler.protocolSupported(URL)) {
+            continue;
+        }
 
-            if (exists) {
-                return list;
-            }
+        URLs.add(URL);
+    }
 
-            return list.concat(URL);
-        }, []);
+    return Array.from(URLs);
 };
 
 /**
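To make the new control flow concrete, here is a hypothetical call to the rewritten method. The constructor usage assumes the v1-era simplecrawler API, and the queue item is a bare object, which suffices here since the method only reads queueItem.url; the exact output is illustrative and subject to cleanURL's normalization.

var Crawler = require("simplecrawler");
var crawler = new Crawler("http://example.com/");

// urlMatch would normally come from a RegExp scan of a fetched document.
var discovered = crawler.cleanExpandResources(
    ["/a", "/a", "b.html", ""],
    { url: "http://example.com/index.html" }
);

// Empty matches are skipped, relative URLs are resolved against the queue
// item, and the Set collapses duplicates, yielding roughly:
// ["http://example.com/a", "http://example.com/b.html"]

Behavior is otherwise unchanged: falsy entries that were previously dropped by filter(Boolean) are now skipped by the if (!URL) guard, and uniqueness moves from a linear some() scan over the accumulator to Set semantics.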
