Skip to content
This repository has been archived by the owner on Mar 7, 2021. It is now read-only.

Skip faulty URLs such as those missing a hostname or having an invali… #387

Closed
wants to merge 1 commit into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
44 changes: 43 additions & 1 deletion lib/crawler.js
Expand Up @@ -870,11 +870,24 @@ Crawler.prototype.cleanExpandResources = function (urlMatch, queueItem) {
continue;
}

// If we hit an empty item, don't return it
if (!URL.length) {
continue;
}

let parsedUrl = uri(URL);

// Filter out broken URLs such as http:/// (protocol and path, but no hostname)
if (parsedUrl.protocol().length // has protocol
&& !parsedUrl.hostname().length // has no hostname
&& parsedUrl.path().length // has path
) {
continue;
}

if (!crawler.portValid(parsedUrl)) {
continue;
}

// If we don't support the protocol in question
if (!crawler.protocolSupported(URL)) {
continue;
Expand Down Expand Up @@ -997,6 +1010,35 @@ Crawler.prototype.domainValid = function(host) {
crawler.scanSubdomains && isSubdomainOf(host, crawler.host);
};

/**
 * Checks whether the port of a parsed URL is a usable TCP port.
 *
 * A URL without an explicit port is considered valid. Otherwise the port
 * must be written as plain decimal digits and lie between 1 and 65535.
 *
 * @param {URI} parsedUrl uri.js instance; only its port() getter is read
 * @returns {boolean} true if no port is given or the given port is valid
 */
Crawler.prototype.portValid = function(parsedUrl) {
    "use strict";

    const rawPort = String(parsedUrl.port());

    // URL does not contain a custom port
    if (rawPort.length === 0) {
        return true;
    }

    // Require decimal digits only: Number() on its own would also accept
    // spellings such as "0x50", "1e3" or " 80", none of which are valid
    // ports inside a URL.
    if (!/^\d+$/.test(rawPort)) {
        return false;
    }

    // verify port is between 1 and 65535
    const port = Number(rawPort);
    return port > 0 && port < 65536;
};

/**
* Initiates discovery of linked resources in an HTML or text document, and
* queues the resources if applicable. Not to be confused with
Expand Down