Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support Unicode international characters #151

Merged
merged 15 commits into from
Feb 15, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
147 changes: 103 additions & 44 deletions lib/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
// Load modules

const Dns = require('dns');

const Punycode = require('punycode');

// Declare internals

Expand Down Expand Up @@ -127,12 +127,27 @@ internals.specials = function () {

const specials = '()<>[]:;@\\,."'; // US-ASCII visible characters not valid for atext (http://tools.ietf.org/html/rfc5322#section-3.2.3)
const lookup = new Array(0x100);
for (let i = 0xff; i >= 0; --i) {
lookup[i] = false;
}
lookup.fill(false);

for (let i = 0; i < specials.length; ++i) {
lookup[specials.charCodeAt(i)] = true;
lookup[specials.codePointAt(i)] = true;
}

return function (code) {

return lookup[code];
};
}();

internals.c0Controls = function () {

const lookup = new Array(0x100);
lookup.fill(false);

// add C0 control characters

for (let i = 0; i < 33; ++i) {
lookup[i] = true;
}

return function (code) {
Expand All @@ -141,12 +156,41 @@ internals.specials = function () {
};
}();

internals.c1Controls = function () {

    // Membership table indexed by code point, covering 0x00-0xFF only.
    // Entries 127 through 159 (DEL followed by the C1 control range) are
    // flagged true; every other entry in the table is false.
    const flags = new Array(0x100);
    for (let code = flags.length - 1; code >= 0; --code) {
        flags[code] = code >= 127 && code < 160;
    }

    // Code points past the end of the table were never populated, so the
    // lookup yields undefined (falsy) for them.
    return (code) => flags[code];
}();

// Regular expressions used when examining IP address text.
internals.regex = {
// Dotted-quad IPv4 address whose four octets are each in the range 0-255
// (leading zeros are accepted). Anchored only at the END of the input: the
// leading \b lets other text precede the address.
ipV4: /\b(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?)$/,
// An entire string consisting of zero to four hexadecimal digits.
// NOTE(review): presumably applied to one colon-separated IPv6 group at a
// time - confirm against internals.checkIpV6.
ipV6: /^[a-fA-F\d]{0,4}$/
};

// $lab:coverage:off$
internals.nulNormalize = function (email) {

    // Cut the address at every NUL character, bring each segment to NFC on its
    // own, then glue the segments back together with the NULs restored.
    return email
        .split('\u0000')
        .map((segment) => segment.normalize('NFC'))
        .join('\u0000');
};
// $lab:coverage:on$


internals.checkIpV6 = function (items) {

Expand All @@ -173,7 +217,7 @@ internals.validDomain = function (tldAtom, options) {


/**
* Check that an email address conforms to RFCs 5321, 5322 and others
* Check that an email address conforms to RFCs 5321, 5322, 6530 and others
*
* We distinguish clearly between a Mailbox as defined by RFC 5321 and an
* addr-spec as defined by RFC 5322. Depending on the context, either can be
Expand All @@ -197,6 +241,7 @@ internals.validDomain = function (tldAtom, options) {
exports.validate = internals.validate = function (email, options, callback) {

options = options || {};
email = internals.normalize(email);

if (typeof options === 'function') {
callback = options;
Expand Down Expand Up @@ -281,8 +326,9 @@ exports.validate = internals.validate = function (email, options, callback) {
const emailLength = email.length;

let token; // Token is used outside the loop, must declare similarly
for (let i = 0; i < emailLength; ++i) {
token = email[i];
for (let i = 0; i < emailLength; i += token.length) {
// Utilize codepoints to account for Unicode surrogate pairs
token = String.fromCodePoint(email.codePointAt(i));

switch (context.now) {
// Local-part
Expand Down Expand Up @@ -350,7 +396,7 @@ exports.validate = internals.validate = function (email, options, callback) {

parseData.local += token;
atomData.locals[elementCount] += token;
++elementLength;
elementLength += Buffer.byteLength(token, 'utf8');

// Quoted string must be the entire element
assertEnd = true;
Expand Down Expand Up @@ -406,7 +452,7 @@ exports.validate = internals.validate = function (email, options, callback) {
}
// http://tools.ietf.org/html/rfc5321#section-4.5.3.1.1 the maximum total length of a user name or other local-part is 64
// octets
else if (parseData.local.length > 64) {
else if (Buffer.byteLength(parseData.local, 'utf8') > 64) {
updateResult(internals.diagnoses.rfc5322LocalTooLong);
}
// http://tools.ietf.org/html/rfc5322#section-3.4.1 comments and folding white space SHOULD NOT be used around "@" in the
Expand Down Expand Up @@ -462,18 +508,18 @@ exports.validate = internals.validate = function (email, options, callback) {
}
else {
context.prev = context.now;
charCode = token.charCodeAt(0);
charCode = token.codePointAt(0);

// Especially if charCode == 10
if (charCode < 33 || charCode > 126 || internals.specials(charCode)) {
if (internals.specials(charCode) || internals.c0Controls(charCode) || internals.c1Controls(charCode)) {

// Fatal error
updateResult(internals.diagnoses.errExpectingATEXT);
}

parseData.local += token;
atomData.locals[elementCount] += token;
++elementLength;
elementLength += Buffer.byteLength(token, 'utf8');
}
}

Expand Down Expand Up @@ -576,7 +622,7 @@ exports.validate = internals.validate = function (email, options, callback) {
if (parseData.domain.length === 0) {
// Domain literal must be the only component
assertEnd = true;
++elementLength;
elementLength += Buffer.byteLength(token, 'utf8');
context.stack.push(context.now);
context.now = internals.components.literal;
parseData.domain += token;
Expand Down Expand Up @@ -660,11 +706,11 @@ exports.validate = internals.validate = function (email, options, callback) {
}
}

charCode = token.charCodeAt(0);
charCode = token.codePointAt(0);
// Assume this token isn't a hyphen unless we discover it is
hyphenFlag = false;

if (charCode < 33 || charCode > 126 || internals.specials(charCode)) {
if (internals.specials(charCode) || internals.c0Controls(charCode) || internals.c1Controls(charCode)) {
// Fatal error
updateResult(internals.diagnoses.errExpectingATEXT);
}
Expand All @@ -676,15 +722,15 @@ exports.validate = internals.validate = function (email, options, callback) {

hyphenFlag = true;
}
// Check if it's neither a number nor a latin letter
else if (charCode < 48 || charCode > 122 || (charCode > 57 && charCode < 65) || (charCode > 90 && charCode < 97)) {
// Check if it's neither a number nor a latin/unicode letter
else if (charCode < 48 || (charCode > 122 && charCode < 192) || (charCode > 57 && charCode < 65) || (charCode > 90 && charCode < 97)) {
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@WesTyler I'm reading through some of this code as I refactor, and now I find myself asking where 192 came from. You wouldn't happen to recall, would you?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I believe 192 is the beginning of the Unicode "international" character set (À).

If I remember correctly, Unicode #s 123-191 are all non-character symbols.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ahh, ok. Thanks!

// This is not an RFC 5321 subdomain, but still OK by RFC 5322
updateResult(internals.diagnoses.rfc5322Domain);
}

parseData.domain += token;
atomData.domains[elementCount] += token;
++elementLength;
elementLength += Buffer.byteLength(token, 'utf8');
}

break;
Expand Down Expand Up @@ -821,7 +867,7 @@ exports.validate = internals.validate = function (email, options, callback) {

parseData.domain += token;
atomData.domains[elementCount] += token;
++elementLength;
elementLength += Buffer.byteLength(token, 'utf8');
context.prev = context.now;
context.now = context.stack.pop();
break;
Expand Down Expand Up @@ -864,22 +910,22 @@ exports.validate = internals.validate = function (email, options, callback) {
// %d12 / ; include the carriage
// %d14-31 / ; return, line feed, and
// %d127 ; white space characters
charCode = token.charCodeAt(0);
charCode = token.codePointAt(0);

// '\r', '\n', ' ', and '\t' have already been parsed above
if (charCode > 127 || charCode === 0 || token === '[') {
if ((charCode !== 127 && internals.c1Controls(charCode)) || charCode === 0 || token === '[') {
// Fatal error
updateResult(internals.diagnoses.errExpectingDTEXT);
break;
}
else if (charCode < 33 || charCode === 127) {
else if (internals.c0Controls(charCode) || charCode === 127) {
updateResult(internals.diagnoses.rfc5322DomainLiteralOBSDText);
}

parseData.literal += token;
parseData.domain += token;
atomData.domains[elementCount] += token;
++elementLength;
elementLength += Buffer.byteLength(token, 'utf8');
}

break;
Expand Down Expand Up @@ -922,7 +968,7 @@ exports.validate = internals.validate = function (email, options, callback) {

parseData.local += ' ';
atomData.locals[elementCount] += ' ';
++elementLength;
elementLength += Buffer.byteLength(token, 'utf8');

updateResult(internals.diagnoses.cfwsFWS);
context.stack.push(context.now);
Expand All @@ -934,7 +980,7 @@ exports.validate = internals.validate = function (email, options, callback) {
case '"':
parseData.local += token;
atomData.locals[elementCount] += token;
++elementLength;
elementLength += Buffer.byteLength(token, 'utf8');
context.prev = context.now;
context.now = context.stack.pop();
break;
Expand All @@ -954,18 +1000,18 @@ exports.validate = internals.validate = function (email, options, callback) {
// %d12 / ; include the carriage
// %d14-31 / ; return, line feed, and
// %d127 ; white space characters
charCode = token.charCodeAt(0);
charCode = token.codePointAt(0);

if (charCode > 127 || charCode === 0 || charCode === 10) {
if ((charCode !== 127 && internals.c1Controls(charCode)) || charCode === 0 || charCode === 10) {
updateResult(internals.diagnoses.errExpectingQTEXT);
}
else if (charCode < 32 || charCode === 127) {
else if (internals.c0Controls(charCode) || charCode === 127) {
updateResult(internals.diagnoses.deprecatedQTEXT);
}

parseData.local += token;
atomData.locals[elementCount] += token;
++elementLength;
elementLength += Buffer.byteLength(token, 'utf8');
}

// http://tools.ietf.org/html/rfc5322#section-3.4.1
Expand All @@ -992,9 +1038,9 @@ exports.validate = internals.validate = function (email, options, callback) {
// %d127 ; white space characters
//
// i.e. obs-qp = "\" (%d0-8, %d10-31 / %d127)
charCode = token.charCodeAt(0);
charCode = token.codePointAt(0);

if (charCode > 127) {
if (charCode !== 127 && internals.c1Controls(charCode)) {
// Fatal error
updateResult(internals.diagnoses.errExpectingQPair);
}
Expand All @@ -1010,23 +1056,23 @@ exports.validate = internals.validate = function (email, options, callback) {
context.prev = context.now;
// End of qpair
context.now = context.stack.pop();
token = '\\' + token;
const escapeToken = '\\' + token;

switch (context.now) {
case internals.components.contextComment:
break;

case internals.components.contextQuotedString:
parseData.local += token;
atomData.locals[elementCount] += token;
parseData.local += escapeToken;
atomData.locals[elementCount] += escapeToken;

// The maximum sizes specified by RFC 5321 are octet counts, so we must include the backslash
elementLength += 2;
break;

case internals.components.literal:
parseData.domain += token;
atomData.domains[elementCount] += token;
parseData.domain += escapeToken;
atomData.domains[elementCount] += escapeToken;

// The maximum sizes specified by RFC 5321 are octet counts, so we must include the backslash
elementLength += 2;
Expand Down Expand Up @@ -1099,14 +1145,14 @@ exports.validate = internals.validate = function (email, options, callback) {
// %d12 / ; include the carriage
// %d14-31 / ; return, line feed, and
// %d127 ; white space characters
charCode = token.charCodeAt(0);
charCode = token.codePointAt(0);

if (charCode > 127 || charCode === 0 || charCode === 10) {
if (charCode === 0 || charCode === 10 || (charCode !== 127 && internals.c1Controls(charCode))) {
// Fatal error
updateResult(internals.diagnoses.errExpectingCTEXT);
break;
}
else if (charCode < 32 || charCode === 127) {
else if (internals.c0Controls(charCode) || charCode === 127) {
updateResult(internals.diagnoses.deprecatedCTEXT);
}
}
Expand Down Expand Up @@ -1219,12 +1265,12 @@ exports.validate = internals.validate = function (email, options, callback) {
}

// Other errors
else if (parseData.domain.length > 255) {
else if (Buffer.byteLength(parseData.domain, 'utf8') > 255) {
// http://tools.ietf.org/html/rfc5321#section-4.5.3.1.2
// The maximum total length of a domain name or number is 255 octets.
updateResult(internals.diagnoses.rfc5322DomainTooLong);
}
else if (parseData.local.length + parseData.domain.length + /* '@' */ 1 > 254) {
else if (Buffer.byteLength(parseData.local, 'utf8') + Buffer.byteLength(parseData.domain, 'utf8') + /* '@' */ 1 > 254) {
// http://tools.ietf.org/html/rfc5321#section-4.1.2
// Forward-path = Path
//
Expand Down Expand Up @@ -1266,7 +1312,7 @@ exports.validate = internals.validate = function (email, options, callback) {

if (!dnsPositive && maxResult < internals.categories.dnsWarn) {
// Per RFC 5321, domain atoms are limited to letter-digit-hyphen, so we only need to check code <= 57 to check for a digit
const code = atomData.domains[elementCount].charCodeAt(0);
const code = atomData.domains[elementCount].codePointAt(0);
if (code <= 57) {
updateResult(internals.diagnoses.rfc5321TLDNumeric);
}
Expand Down Expand Up @@ -1311,7 +1357,7 @@ exports.validate = internals.validate = function (email, options, callback) {
parseData.domain += '.';
}

const dnsDomain = parseData.domain;
const dnsDomain = Punycode.toASCII(parseData.domain);
Dns.resolveMx(dnsDomain, (err, mxRecords) => {

// If we have a fatal error, then we must assume that there are no records
Expand Down Expand Up @@ -1376,3 +1422,16 @@ exports.diagnoses = internals.validate.diagnoses = (function () {

return diag;
})();


exports.normalize = internals.normalize = function (email) {

// $lab:coverage:off$
if (process.version[1] === '4' && email.indexOf('\u0000') >= 0) {
return internals.nulNormalize(email);
}
// $lab:coverage:on$


return email.normalize('NFC');
};
4 changes: 3 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,12 @@
"node": ">=4.0.0"
},
"dependencies": {
"punycode": "2.1.x"
},
"devDependencies": {
"code": "3.x.x",
"lab": "10.x.x"
"lab": "10.x.x",
"proxyquire": "1.x.x"
},
"scripts": {
"test": "lab -a code -t 100 -L -m 5000",
Expand Down