This repository has been archived by the owner on Mar 7, 2021. It is now read-only.

Commit

Added try/catch block around URL parsing in Crawler#getRobotsTxt redirection logic

More in #363
fredrikekelund committed Jul 25, 2017
1 parent d7af4f1 commit 9f4e4ec
Showing 1 changed file with 12 additions and 3 deletions.
lib/crawler.js: 12 additions & 3 deletions
@@ -650,9 +650,18 @@ Crawler.prototype.getRobotsTxt = function(url, callback) {
 
 response.destroy();
 
-var redirectTarget = uri(response.headers.location)
-    .absoluteTo(robotsTxtUrl)
-    .normalize();
+var redirectTarget;
+
+try {
+    redirectTarget = uri(response.headers.location)
+        .absoluteTo(robotsTxtUrl)
+        .normalize();
+} catch (error) {
+    var robotsTxtHost = uri(robotsTxtUrl).pathname("").href();
+    errorMsg = util.format("Faulty redirect URL when fetching robots.txt for %s", robotsTxtHost);
+
+    return callback(new Error(errorMsg));
+}
 
 if (crawler.domainValid(redirectTarget.hostname())) {
     crawler.getRobotsTxt(redirectTarget.href(), callback);
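
For context, the sketch below isolates the pattern this commit introduces: wrapping the URI.js ("urijs") parsing of the Location header in a try/catch so that a faulty redirect URL is reported through the callback instead of escaping as an uncaught exception. The resolveRedirect function name, the example URLs, and the standalone errorMsg declaration are illustrative assumptions, not part of the actual patch.

    // Minimal sketch of the pattern added in this commit, assuming the same
    // URI.js ("urijs") dependency that lib/crawler.js uses. Function name and
    // URLs are illustrative, not taken from the patch.
    var uri = require("urijs"),
        util = require("util");

    function resolveRedirect(location, robotsTxtUrl, callback) {
        var redirectTarget;

        try {
            // uri() / absoluteTo() can throw on a malformed Location value;
            // without the try/catch, that exception would escape the caller.
            redirectTarget = uri(location)
                .absoluteTo(robotsTxtUrl)
                .normalize();
        } catch (error) {
            // Strip the path so the error message only names the host,
            // mirroring what the patch does with robotsTxtHost.
            var robotsTxtHost = uri(robotsTxtUrl).pathname("").href();
            var errorMsg = util.format(
                "Faulty redirect URL when fetching robots.txt for %s", robotsTxtHost);

            return callback(new Error(errorMsg));
        }

        callback(null, redirectTarget.href());
    }

    // Illustrative usage: a relative Location header resolved against the
    // robots.txt URL it redirected from.
    resolveRedirect("/robots.txt", "http://example.com/robots.txt", function(error, target) {
        console.log(error || target); // http://example.com/robots.txt
    });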
