Skip to content

Commit

Permalink
Fixed some case-folding and added Table A.1 for IDNA (#42).
Browse files Browse the repository at this point in the history
  • Loading branch information
ricmoo committed Aug 3, 2019
1 parent c09de16 commit f955dca
Show file tree
Hide file tree
Showing 10 changed files with 10,456 additions and 87 deletions.
118 changes: 69 additions & 49 deletions packages/strings/src.ts/idna.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,12 @@

import { toUtf8CodePoints, _toUtf8String, UnicodeNormalizationForm } from "./utf8";

let _tmp = 0;

type Ranged = {
l: number,
h: number,
d?: number,
s?: number,
e?: Array<number>
l: number, // Lo value
h: number, // High value (less the lo)
d?: number, // Delta/stride (default: 1)
s?: number, // Shift (default: 1)
e?: Array<number> // Exceptions to skip
};

type Table = { [ src: number ]: Array<number> };
Expand Down Expand Up @@ -40,6 +38,37 @@ function createTable(data: string, func?: (value: string) => Array<number>): Tab
return result;
}

/**
 *  Decodes a compact, comma-separated list of hex ranges into Ranged
 *  entries suitable for matchMap.
 *
 *  Each entry is one of:
 *    - "LO"      => a single value (span 0)
 *    - "LO-"     => a span of 1
 *    - "LO-SPAN" => an explicit span
 *
 *  LO is encoded relative to the END of the previous entry, so the span
 *  of the previous entry is added to it; the resulting `l` is therefore
 *  relative to the START of the previous entry, which is what matchMap
 *  accumulates.
 *
 *  @param data  The encoded range list (all numbers are hexadecimal)
 *  @returns     One `{ l, h }` entry per comma-separated item
 */
function createRangeTable(data: string): Array<Ranged> {
    // Span ("h") of the previously decoded entry
    let previousSpan = 0;

    const table: Array<Ranged> = [ ];
    data.split(",").forEach((entry) => {
        const parts = entry.split("-");

        // No dash means a single value; a trailing dash means a span of 1
        let spanText = "0";
        if (parts.length !== 1) {
            spanText = (parts[1] === "") ? "1": parts[1];
        }

        const offset = previousSpan + parseInt(parts[0], 16);
        previousSpan = parseInt(spanText, 16);

        table.push({ l: offset, h: previousSpan });
    });

    return table;
}

/**
 *  Finds the first range that contains a value.
 *
 *  The `l` of each range is relative to the start of the range before
 *  it, so the absolute start is accumulated while scanning. A range
 *  matches when the value lies within [start, start + h], falls on the
 *  range's stride `d` (default 1) and its offset from the start is not
 *  listed in the exceptions `e`.
 *
 *  @param value   The value (e.g. a code point) to look up
 *  @param ranges  The relative-encoded ranges, in ascending order
 *  @returns       The matching range, or null if no range matches
 */
function matchMap(value: number, ranges: Array<Ranged>): Ranged {
    // Absolute start of the range currently being examined
    let start = 0;

    for (const entry of ranges) {
        start += entry.l;

        const inside = (value >= start && value <= start + entry.h);
        if (!inside) { continue; }

        // Only every d-th value (counted from the start) belongs to the range
        const offset = value - start;
        if ((offset % (entry.d || 1)) !== 0) { continue; }

        // Explicitly excluded offsets do not match
        if (entry.e && entry.e.indexOf(offset) !== -1) { continue; }

        return entry;
    }

    return null;
}

const Table_A_1_ranges = createRangeTable("221,13-1b,5f-,40-10,51-f,11-3,3-3,2-2,2-4,8,2,15,2d,28-8,88,48,27-,3-5,11-20,27-,8,28,3-5,12,18,b-a,1c-4,6-16,2-d,2-2,2,1b-4,17-9,8f-,10,f,1f-2,1c-34,33-14e,4,36-,13-,6-2,1a-f,4,9-,3-,17,8,2-2,5-,2,8-,3-,4-8,2-3,3,6-,16-6,2-,7-3,3-,17,8,3,3,3-,2,6-3,3-,4-a,5,2-6,10-b,4,8,2,4,17,8,3,6-,b,4,4-,2-e,2-4,b-10,4,9-,3-,17,8,3-,5-,9-2,3-,4-7,3-3,3,4-3,c-10,3,7-2,4,5-2,3,2,3-2,3-2,4-2,9,4-3,6-2,4,5-8,2-e,d-d,4,9,4,18,b,6-3,8,4,5-6,3-8,3-3,b-11,3,9,4,18,b,6-3,8,4,5-6,3-6,2,3-3,b-11,3,9,4,18,11-3,7-,4,5-8,2-7,3-3,b-11,3,13-2,19,a,2-,8-2,2-3,7,2,9-11,4-b,3b-3,1e-24,3,2-,3,2-,2-5,5,8,4,2,2-,3,e,4-,6,2,7-,b-,3-21,49,23-5,1c-3,9,25,10-,2-2f,23,6,3,8-2,5-5,1b-45,27-9,2a-,2-3,5b-4,45-4,53-5,8,40,2,5-,8,2,5-,28,2,5-,20,2,5-,8,2,5-,8,8,18,20,2,5-,8,28,14-5,1d-22,56-b,277-8,1e-2,52-e,e,8-a,18-8,15-b,e,4,3-b,5e-2,b-15,10,b-5,59-7,2b-555,9d-3,5b-5,17-,7-,27-,7-,9,2,2,2,20-,36,10,f-,7,14-,4,a,54-3,2-6,6-5,9-,1c-10,13-1d,1c-14,3c-,10-6,32-b,240-30,28-18,c-14,a0,115-,3,66-,b-76,5,5-,1d,24,2,5-2,2,8-,35-2,19,f-10,1d-3,311-37f,1b,5a-b,d7-19,d-3,41,57-,68-4,29-3,5f,29-37,2e-2,25-c,2c-2,4e-3,30,78-3,64-,20,19b7-49,51a7-59,48e-2,38-738,2ba5-5b,222f-,3c-94,8-b,6-4,1b,6,2,3,3,6d-20,16e-f,41-,37-7,2e-2,11-f,5-b,18-,b,14,5-3,6,88-,2,bf-2,7-,7-,7-,4-2,8,8-9,8-2ff,20,5-b,1c-b4,27-,27-cbb1,f7-9,28-2,b5-221,56,48,3-,2-,3-,5,d,2,5,3,42,5-,9,8,1d,5,6,2-2,8,153-3,123-3,33-27fd,a6da-5128,21f-5df,3-fffd,3-fffd,3-fffd,3-fffd,3-fffd,3-fffd,3-fffd,3-fffd,3-fffd,3-fffd,3-fffd,3,2-1d,61-ff7d");

// @TODO: Make this relative...
// Individual code points, stored as absolute values.
// NOTE(review): presumably the stringprep Table B.1 entries - confirm
// against the code that consumes this table.
const Table_B_1_flags: Array<number> = [
    0x00ad, 0x034f, 0x1806, 0x180b, 0x180c, 0x180d,
    0x200b, 0x200c, 0x200d, 0x2060, 0xfeff
];

Expand Down Expand Up @@ -91,31 +120,8 @@ const Table_B_2_lut_abs = createTable("b5:3bc,c3:ff,7:73,2:253,5:254,3:256,1:257
const Table_B_2_lut_rel = createTable("179:1,2:1,2:1,5:1,2:1,a:4f,a:1,8:1,2:1,2:1,3:1,5:1,3:1,4:1,2:1,3:1,4:1,8:2,1:1,2:2,1:1,2:2,27:2,195:26,2:25,1:25,1:25,2:40,2:3f,1:3f,33:1,11:-6,1:-9,1ac7:-3a,6d:-8,1:-8,1:-8,1:-8,1:-8,1:-8,1:-8,1:-8,9:-8,1:-8,1:-8,1:-8,1:-8,1:-8,b:-8,1:-8,1:-8,1:-8,1:-8,1:-8,1:-8,1:-8,9:-8,1:-8,1:-8,1:-8,1:-8,1:-8,1:-8,1:-8,9:-8,1:-8,1:-8,1:-8,1:-8,1:-8,c:-8,2:-8,2:-8,2:-8,9:-8,1:-8,1:-8,1:-8,1:-8,1:-8,1:-8,1:-8,49:-8,1:-8,1:-4a,1:-4a,d:-56,1:-56,1:-56,1:-56,d:-8,1:-8,f:-8,1:-8,3:-7");
const Table_B_2_complex = createTable("df:00730073,51:00690307,19:02BC006E,a7:006A030C,18a:002003B9,16:03B903080301,20:03C503080301,1d7:05650582,190f:00680331,1:00740308,1:0077030A,1:0079030A,1:006102BE,b6:03C50313,2:03C503130300,2:03C503130301,2:03C503130342,2a:1F0003B9,1:1F0103B9,1:1F0203B9,1:1F0303B9,1:1F0403B9,1:1F0503B9,1:1F0603B9,1:1F0703B9,1:1F0003B9,1:1F0103B9,1:1F0203B9,1:1F0303B9,1:1F0403B9,1:1F0503B9,1:1F0603B9,1:1F0703B9,1:1F2003B9,1:1F2103B9,1:1F2203B9,1:1F2303B9,1:1F2403B9,1:1F2503B9,1:1F2603B9,1:1F2703B9,1:1F2003B9,1:1F2103B9,1:1F2203B9,1:1F2303B9,1:1F2403B9,1:1F2503B9,1:1F2603B9,1:1F2703B9,1:1F6003B9,1:1F6103B9,1:1F6203B9,1:1F6303B9,1:1F6403B9,1:1F6503B9,1:1F6603B9,1:1F6703B9,1:1F6003B9,1:1F6103B9,1:1F6203B9,1:1F6303B9,1:1F6403B9,1:1F6503B9,1:1F6603B9,1:1F6703B9,3:1F7003B9,1:03B103B9,1:03AC03B9,2:03B10342,1:03B1034203B9,5:03B103B9,6:1F7403B9,1:03B703B9,1:03AE03B9,2:03B70342,1:03B7034203B9,5:03B703B9,6:03B903080300,1:03B903080301,3:03B90342,1:03B903080342,b:03C503080300,1:03C503080301,1:03C10313,2:03C50342,1:03C503080342,b:1F7C03B9,1:03C903B9,1:03CE03B9,2:03C90342,1:03C9034203B9,5:03C903B9,ac:00720073,5b:00B00063,6:00B00066,d:006E006F,a:0073006D,1:00740065006C,1:0074006D,124f:006800700061,2:00610075,2:006F0076,b:00700061,1:006E0061,1:03BC0061,1:006D0061,1:006B0061,1:006B0062,1:006D0062,1:00670062,3:00700066,1:006E0066,1:03BC0066,4:0068007A,1:006B0068007A,1:006D0068007A,1:00670068007A,1:00740068007A,15:00700061,1:006B00700061,1:006D00700061,1:006700700061,8:00700076,1:006E0076,1:03BC0076,1:006D0076,1:006B0076,1:006D0076,1:00700077,1:006E0077,1:03BC0077,1:006D0077,1:006B0077,1:006D0077,1:006B03C9,1:006D03C9,2:00620071,3:00632215006B0067,1:0063006F002E,1:00640062,1:00670079,2:00680070,2:006B006B,1:006B006D,9:00700068,2:00700070006D,1:00700072,2:00730076,1:00770062,c723:00660066,1:00660069,1:0066006C,1:006600660069,1:00660066006C,1:00730074,1:00730074,d:05740576,1:05740565,1:0574056B,1:057E0576,1:0574056D", bytes2);

_tmp = 0;
const Table_C_flags = "70f,f71,18e".split(",").map((v) => {
_tmp += parseInt(v, 16);
return _tmp;
});
_tmp = 0;
const Table_C_ranges = "80-20,2c0,1cc0-f,28-7,37-4,b-5,f86-b,a810-20ff,25d0-1f,229-6,d17a-7,2e8b,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,22-5f".split(",").map((v) => {
let comps = v.split("-");
if (comps.length === 1) { comps[1] = "1"; }
_tmp += parseInt(comps[0], 16);
return { l: _tmp, h: parseInt(comps[1], 16) }
});
const Table_C_ranges = createRangeTable("80-20,2a0-,39c,32,f71,18e,7f2-f,19-7,30-4,7-5,f81-b,5,a800-20ff,4d1-1f,110,fa-6,d174-7,2e84-,ffff-,ffff-,ffff-,ffff-,ffff-,ffff-,ffff-,ffff-,ffff-,ffff-,ffff-,ffff-,2,1f-5f,ff7f-20001");

/**
 *  Returns the first entry of `ranges` that covers `value`, or null.
 *
 *  Entry starts are stored relative to the previous entry's start, so a
 *  running base is kept. An entry covers the value when the value is in
 *  [base, base + h], its distance from base is a multiple of the stride
 *  `d` (default 1), and that distance is not one of the exceptions `e`.
 */
function matchMap(value: number, ranges: Array<Ranged>): Ranged {
    for (let index = 0, base = 0; index < ranges.length; index++) {
        const current = ranges[index];
        base += current.l;

        const inSpan = (value >= base && value <= base + current.h);
        if (inSpan && ((value - base) % (current.d || 1)) === 0) {
            // Offsets listed as exceptions are skipped; keep scanning
            const skipped = (current.e && current.e.indexOf(value - base) !== -1);
            if (!skipped) { return current; }
        }
    }
    return null;
}

function flatten(values: Array<Array<number>>): Array<number> {
return values.reduce((accum, value) => {
Expand All @@ -124,28 +130,36 @@ function flatten(values: Array<Array<number>>): Array<number> {
}, [ ]);
}

/**
 *  Reports whether a code point falls inside one of the Table A.1
 *  ranges (the code points nameprep rejects as unassigned).
 *
 *  @param codepoint  The code point to test
 *  @returns          true if some Table A.1 range matches it
 */
export function _nameprepTableA1(codepoint: number): boolean {
    const hit = matchMap(codepoint, Table_A_1_ranges);
    return Boolean(hit);
}

export function _nameprepTableB2(codepoint: number): Array<number> {
let match = matchMap(codepoint, Table_B_2_ranges);
if (match) { return [ codepoint + match.s ]; }
let range = matchMap(codepoint, Table_B_2_ranges);
if (range) { return [ codepoint + range.s ]; }

let codes = Table_B_2_lut_abs[codepoint];
if (codes) { return codes; }

let codes = Table_B_2_lut_abs[codepoint];
if (codes) { return codes; }
let shift = Table_B_2_lut_rel[codepoint];
if (shift) { return [ codepoint + shift[0] ]; }

let shift = Table_B_2_lut_rel[codepoint];
if (shift) { return [ codepoint + shift[0] ]; }
let complex = Table_B_2_complex[codepoint];
if (complex) { return complex; }

let complex = Table_B_2_complex[codepoint];
if (complex) { return complex; }
return null;
}

return null;
/**
 *  Reports whether a code point falls inside one of the combined
 *  Table C ranges (the code points nameprep rejects as prohibited).
 *
 *  @param codepoint  The code point to test
 *  @returns          true if some Table C range matches it
 */
export function _nameprepTableC(codepoint: number): boolean {
    const hit = matchMap(codepoint, Table_C_ranges);
    return Boolean(hit);
}

export function nameprep(value: string): string {

// This allows platforms with incomplete normalize to bypass
// it for very basic names which the built-in toLowerCase
// will certainly handle correctly
if (value.match(/^[a-z0-9-]*$/i)) { return value.toLowerCase(); }
if (value.match(/^[a-z0-9-]*$/i) && value.length <= 59) { return value.toLowerCase(); }

// Get the code points (keeping the current normalization)
let codes = toUtf8CodePoints(value);
Expand All @@ -163,17 +177,21 @@ export function nameprep(value: string): string {
return [ code ];
}));

// Normalize using fomr KC
// Normalize using form KC
codes = toUtf8CodePoints(_toUtf8String(codes), UnicodeNormalizationForm.NFKC);

// Prohibit C.1.2, C.2.2, C.3, C.4, C.5, C.6, C.7, C.8, C.9
// Prohibit Tables C.1.2, C.2.2, C.3, C.4, C.5, C.6, C.7, C.8, C.9
codes.forEach((code) => {
if (Table_C_flags.indexOf(code) >= 0) { throw new Error("invalid character code"); }
Table_C_ranges.forEach((range) => {
if (code >= range.l && code <= range.l + range.h) {
throw new Error("STRINGPREP_CONTAINS_PROHIBITED");
}
});
if (_nameprepTableC(code)) {
throw new Error("STRINGPREP_CONTAINS_PROHIBITED");
}
});

// Prohibit Unassigned Code Points (Table A.1)
codes.forEach((code) => {
if (_nameprepTableA1(code)) {
throw new Error("STRINGPREP_CONTAINS_UNASSIGNED");
}
});

// IDNA extras
Expand All @@ -187,6 +205,8 @@ export function nameprep(value: string): string {
// IDNA: 4.2.4
if (name.length > 63) { throw new Error("too long"); }



return name;
}

46 changes: 46 additions & 0 deletions packages/testcases/input/nameprep/extract-tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import json
import re

# Concatenate the useful lines of the test-vector document into a single
# string. Blank lines and "#" lines are dropped, as are lines starting with
# "Josefsson" or "Internet-Draft" (presumably the draft's running page
# header/footer - confirm against the input file).
output = ""
# open() works on both Python 2 and 3; the with-block closes the file
with open("test-vectors-00.txt") as handle:
    for line in handle:
        line = line.strip()
        if line == "" or line[0:1] == "#":
            continue
        if line.startswith("Josefsson") or line.startswith("Internet-Draft"):
            continue
        output += line.replace("\n", "")

Tests = [ ]

def get_byte(v):
    """Convert one token of a quoted test string into its byte value.

    A single-character token yields that character's ordinal. Any longer
    token is treated as a "\\xHH" escape: the two characters after the
    leading "\\x" are parsed as a hexadecimal number.
    """
    if len(v) == 1:
        return ord(v)
    return int(v[2:4], 16)

def get_string(value):
    """Parse one field of a test record.

    A double-quoted field is returned as a list of byte values: adjacent
    quote pairs ('""', left where two string literals were joined) are
    removed first, then every "\\xHH" escape or single character is
    converted with get_byte. The bare word "null" (any case) yields None.

    Raises Exception("unhandled") for anything else, including an empty
    field.
    """
    value = value.strip()
    # Slices rather than indexes, so an empty field is reported as
    # "unhandled" instead of raising IndexError
    if value[:1] == '"' and value[-1:] == '"':
        tokens = re.findall("(\\\\x[0-9a-fA-F]{2}|.)", value[1:-1].replace('""', ''))
        # Build a real list; on Python 3 a bare map() is a lazy iterator
        # that json.dumps cannot serialize
        return [get_byte(token) for token in tokens]
    if value.lower() == "null":
        return None
    raise Exception("unhandled")

Tests = [ ]

# Every brace-delimited group in the concatenated text is one test record;
# its comma-separated fields are, in order: comment, input, output and the
# optional profile, flags and rc.
matches = re.findall("({(?:.|\n)*?})", output)
for m in matches:
    # Drop the surrounding braces before splitting into fields
    comps = m[1:-1].split(",")
    test = dict(
        # The comment is kept as text, minus its surrounding quotes
        comment = comps[0].strip()[1:-1],
        input = get_string(comps[1]),
        output = get_string(comps[2])
    )
    if len(comps) >= 4:
        test["profile"] = get_string(comps[3])
    if len(comps) >= 5:
        test["flags"] = comps[4].strip()
    if len(comps) >= 6:
        test["rc"] = comps[5].strip()
    Tests.append(test)

# Parenthesized so the statement is valid on both Python 2 and Python 3
print(json.dumps(Tests))
19 changes: 15 additions & 4 deletions packages/testcases/input/nameprep/generate-b2.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,17 +149,28 @@ def add_simple_data(data):
mappings.append(data)
debug[data["l"]] = "MAP:" + str(data)

# Create complex table (things that map to more than one code point)
complex = { }
complex_output = [ ];
for (src, dst, reason) in weird:
for word in dst.split(" "):
complex_output.append(int(word, 16))
if len(word) != 4: raise Exception("hmmm")
complex[int(src, 16)] = dst.replace(" ", "")

# Experimenting: We can easily create a LUT for the individual
# components, as there is substantial overlap.
#complex_output = dict((x, True) for x in complex_output).keys()
#complex_output.sort()
#print "COM", complex_output, len(complex_output)

# Sort mappings by lo
mappings.sort(lambda a, b: cmp(a["l"], b["l"]))

debug_keys = debug.keys()
debug_keys.sort()
for d in debug_keys:
print d, debug[d]
#debug_keys = debug.keys()
#debug_keys.sort()
#for d in debug_keys:
# print d, debug[d]

#print mappings

Expand Down
56 changes: 22 additions & 34 deletions packages/testcases/input/nameprep/generate-c.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
def hexify(v):
    """Return the lowercase hexadecimal digits of integer v, without "0x".

    Uses "%x" formatting rather than slicing hex(): the slice turns
    negative values into garbage ("xff") and, on Python 2, leaves a
    trailing "L" on longs.
    """
    return "%x" % v

prohibit = [ ]

table = None
Expand Down Expand Up @@ -28,42 +31,27 @@
prohibit = list(dict([(p, True) for p in prohibit]).keys())
prohibit.sort()

prohibit_single = [ ]
prohibit_range = [ ]
output = [ dict(lo = prohibit[0], hi = prohibit[0]) ]

last_range_start = None
last = 0
for p in prohibit:
if p - 1 == last:
if last_range_start is None:
last_range_start = last
if len(prohibit_single) > 0 and prohibit_single[-1] == last:
prohibit_single.pop()
for p in prohibit[1:]:
if p - 1 == output[-1]["hi"]:
output[-1]["hi"] = p
else:
if last_range_start is not None:
print "Range", last_range_start, last - last_range_start, hex(last_range_start)
length = last - last_range_start
if length == 1:
length = ""
else:
length = "-" + hex(length)[2:]
prohibit_range.append([ last_range_start, length ])
last_range_start = None
else:
print "Single", p, hex(p)
prohibit_single.append(p)
last = p
output.append(dict(lo = p, hi = p))

last = 0
for i in xrange(0, len(prohibit_single)):
v = prohibit_single[i]
prohibit_single[i] -= last
last = v
print 'const Table_C_lut = "' + ",".join(hex(x)[2:] for x in prohibit_single) + '";'
print output

last = 0
for item in prohibit_range:
v = item[0]
item[0] -= last
last = v
print 'const Table_C_ranges = "' + ",".join(("%s%s" % (hex(p[0])[2:], p[1])) for p in prohibit_range) + '";';
for r in output:
r["h"] = r["hi"] - r["lo"]
r["l"] = r["lo"] - last
last = r["hi"]

r["range"] = hexify(r["l"])
if r["h"] > 1:
r["range"] += "-" + hexify(r["h"])
elif r["h"] > 0:
r["range"] += "-"

print 'const Table_C_ranges = "' + ",".join(x["range"] for x in output) + '";'

0 comments on commit f955dca

Please sign in to comment.