From 9f4e4ecf4977e13fc1e7518bb4ea1ca3470aee02 Mon Sep 17 00:00:00 2001
From: Fredrik Ekelund
Date: Tue, 25 Jul 2017 18:29:05 +0200
Subject: [PATCH] Added try/catch block around URL parsing in
 Crawler#getRobotsTxt redirection logic

More in #363
---
 lib/crawler.js | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/lib/crawler.js b/lib/crawler.js
index 19f9adb..94f5cbc 100644
--- a/lib/crawler.js
+++ b/lib/crawler.js
@@ -650,9 +650,18 @@ Crawler.prototype.getRobotsTxt = function(url, callback) {
 
             response.destroy();
 
-            var redirectTarget = uri(response.headers.location)
-                .absoluteTo(robotsTxtUrl)
-                .normalize();
+            var redirectTarget;
+
+            try {
+                redirectTarget = uri(response.headers.location)
+                    .absoluteTo(robotsTxtUrl)
+                    .normalize();
+            } catch (error) {
+                var robotsTxtHost = uri(robotsTxtUrl).pathname("").href();
+                errorMsg = util.format("Faulty redirect URL when fetching robots.txt for %s", robotsTxtHost);
+
+                return callback(new Error(errorMsg));
+            }
 
             if (crawler.domainValid(redirectTarget.hostname())) {
                 crawler.getRobotsTxt(redirectTarget.href(), callback);
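
Note: the following is a minimal standalone sketch of the defensive pattern this patch introduces, shown outside the crawler for clarity. It assumes URI.js is installed as the "urijs" package (the `uri` identifier used in lib/crawler.js); the `resolveRedirect` helper and the example URLs are hypothetical and only illustrate how a throwing URL resolution is reported through the callback instead of becoming an uncaught exception.

    var uri = require("urijs");

    // Hypothetical helper mirroring the patched redirection logic: the URL
    // resolution that may throw is wrapped in try/catch, and any failure is
    // passed to the callback as an Error.
    function resolveRedirect(location, baseUrl, callback) {
        var redirectTarget;

        try {
            // URI.js can throw on malformed input (for example hostnames with
            // illegal characters), which previously crashed Crawler#getRobotsTxt.
            redirectTarget = uri(location)
                .absoluteTo(baseUrl)
                .normalize();
        } catch (error) {
            return callback(new Error("Faulty redirect URL: " + location));
        }

        callback(null, redirectTarget.href());
    }

    // A well-formed relative Location header resolves against the base URL...
    resolveRedirect("/robots.txt", "http://example.com/index.html", function(error, href) {
        console.log(error || href); // http://example.com/robots.txt
    });

    // ...while a malformed one may (depending on the URI.js version and input)
    // hit the catch branch and surface as a callback error rather than a crash.
    resolveRedirect("http://bad host/robots.txt", "http://example.com/", function(error, href) {
        console.log(error || href);
    });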