Merge pull request #353 from hotoo/feat/segment-intl.segmenter

Feat/segment intl.segmenter
hotoo · Nov 12, 2023 · deb1b3c · deb1b3c
2 parents 1a34494 + 8c527b6
commit deb1b3c
Show file tree

Hide file tree

Showing 10 changed files with 65 additions and 42 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,11 @@
 
 ----
 
+## 3.0.0-alpha.6 (2022-09-21)
+
+- feat: support [Intl.Segmenter](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Intl/Segmenter) for segmentation.
+- dict: 雪茄
+
 ## 3.0.0-alpha.5 (2022-07-12)
 
 - fix: npm publish 中没有保护 esm 目录。

diff --git a/README.en-US.md b/README.en-US.md
@@ -135,11 +135,11 @@ export type IPinyinMode =
 The segment method.
 
 - Default is disable segment: `false`，
-- If set `true`, use "segmentit" module for segment in Web, use "nodejieba" for segment in Node.
-- Also specify follow string for segment (bug just "segmentit" in web):
+- If set to `true`, "Intl.Segmenter" is used by default for segmentation on both Web and Node.
+- You can also specify one of the following strings to choose the segmenter (but only "Intl.Segmenter" and "segmentit" are supported on the Web):
 
 ```typescript
-export type IPinyinSegment = "nodejieba" | "segmentit" | "@node-rs/jieba";
+export type IPinyinSegment = "Intl.Segmenter" | "nodejieba" | "segmentit" | "@node-rs/jieba";
 ```
 
 
@@ -247,23 +247,6 @@ npm test
 
 ## Q&A
 
-### What's the different Node version and Web version?
-
-`pinyin` support Node and Web browser now, the API and usage is complete same.
-
-But the Web version is simple than Node version. Just frequently-used dict,
-without segmentation, and the dict is compress for web.
-
-Because of Traditional and Segmentation, the convert result will be not complete same.
-and the test case have some different too.
-
-| Feature      | Web version                     | Node version                     |
-|--------------|---------------------------------|----------------------------------|
-| Dict         | Frequently-used Dict, Compress. | Complete Dict, without Compress. |
-| Segmentation | NO                              | Segmentation options.            |
-| Traditional  | NO                              | Full Traditional support.        |
-
-
 ### How to sort by pinyin?
 
 This module provide default compare implementation:

diff --git a/README.md b/README.md
@@ -65,11 +65,11 @@ console.log(pinyin("中心", {
 
 console.log(pinyin("中心", {
   heteronym: true,              // 启用多音字模式
-  segment: true,                // 启用分词，以解决多音字问题。默认不开启，使用 true 开启使用 nodejieba 分词库。
+  segment: true,                // 启用分词，以解决多音字问题。默认不开启，使用 true 开启使用 Intl.Segmenter 分词库。
 }));                            // [ [ 'zhōng' ], [ 'xīn' ] ]
 
 console.log(pinyin("中心", {
-  segment: "@node-rs/jieba",    // 指定分词库，可以是 "nodejieba"、"segmentit"、"@node-rs/jieba"。
+  segment: "@node-rs/jieba",    // 指定分词库，可以是 "Intl.Segmenter"、"nodejieba"、"segmentit"、"@node-rs/jieba"。
 }));                            // [ [ 'zhōng' ], [ 'xīn' ] ]
 
 console.log(pinyin("我喜欢你", {
@@ -144,11 +144,11 @@ export type IPinyinMode =
 分词方式。
 
 - 默认关闭 `false`，
-- 也可以设置为 `true` 开启，Web 版中使用 "segmentit" 分词，Node 版中使用 "nodejieba" 分词。
-- 也可以声明以下字符串来指定分词算法。但目前 Web 版只支持 "segmentit" 分词。
+- 也可以设置为 `true` 开启，Web 和 Node 版中均使用 "Intl.Segmenter" 分词。
+- 也可以声明以下字符串来指定分词算法。但目前 Web 版只支持 "Intl.Segmenter" 和 "segmentit" 分词。
 
 ```typescript
-export type IPinyinSegment = "nodejieba" | "segmentit" | "@node-rs/jieba";
+export type IPinyinSegment = "Intl.Segmenter" | "nodejieba" | "segmentit" | "@node-rs/jieba";
 ```
 
 ## API
@@ -178,8 +178,8 @@ options 是可选的，可以设定拼音风格，或打开多音字选项。
 但性能会极大的下降，内存也会使用更多。
 
 - 默认不启用分词。
-- 如果 `segemnt = true`，默认使用 nodejieba 分词。
-- 可以指定 "nodejieba"、"segmentit"、"@node-rs/jieba" 进行分词。
+- 如果 `segment = true`，默认使用 Intl.Segmenter 分词。
+- 可以指定 "Intl.Segmenter"、"nodejieba"、"segmentit"、"@node-rs/jieba" 进行分词。
 
 ### `<Boolean> options.heteronym`
 

diff --git a/bin/pinyin b/bin/pinyin
@@ -8,7 +8,7 @@ commander.
   option('-v, --version', 'output the version number').
   option('-s, --style <style>', 'pinyin styles: [NORMAL,TONE,TONE2,INITIALS,FIRST_LETTER]').
   option('-m, --mode <mode>', 'pinyin mode: [NORMAL,SURNAME]').
-  option('-S, --segment [segment]', 'segmentation word to phrases, support "nodejieba", "@node-rs/jieba", "segmentit"').
+  option('-S, --segment [segment]', 'segmentation word to phrases, support "Intl.Segmenter", "nodejieba", "@node-rs/jieba", "segmentit"').
   option('-h, --heteronym', 'output heteronym pinyins').
   option('-g, --group', 'output group by phrases').
   option('-c, --compact', 'output the compact pinyin result').
@@ -20,7 +20,7 @@ if (commander.list) {
 }
 
 // --segment <segment> 是可选项，当后面带的不合法的 segment 时，当作文本处理。
-const validSegmentList = ["nodejieba", "@node-rs/jieba", "segmentit", true, false, undefined];
+const validSegmentList = ["Intl.Segmenter", "nodejieba", "@node-rs/jieba", "segmentit", true, false, undefined];
 if (!validSegmentList.includes(commander.segment)) {
   commander.args.splice(0, 0, commander.segment);
   commander.segment = true;

diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "pinyin",
-  "version": "3.0.0-alpha.5",
+  "version": "3.0.0-alpha.6",
   "description": "汉语拼音转换工具。",
   "main": "./lib/pinyin.js",
   "module": "./esm/pinyin.js",

diff --git a/src/data/phrases-dict.ts b/src/data/phrases-dict.ts
@@ -37144,6 +37144,7 @@ const phrases_dict: Record<string,string[][]> = {
 "学识渊博": [["xué"], ["shí"], ["yuān"], ["bó"]],
 "学疏才浅": [["xué"], ["shū"], ["cái"], ["qiǎn"]],
 "学术界": [["xué"], ["shù"], ["jiè"]],
+"雪茄": [["xuě"], ["jiā"]],
 "雪北香南": [["xuě"], ["běi"], ["xiāng"], ["nán"]],
 "雪耻报仇": [["xuě"], ["chǐ"], ["bào"], ["chóu"]],
 "雪窗萤几": [["xuě"], ["chuāng"], ["yíng"], ["jǐ"]],

diff --git a/src/segment-web.ts b/src/segment-web.ts
@@ -22,7 +22,7 @@ export function segment(hans: string, segment?: IPinyinSegment): string[] {
 
   // Intl.Segmenter
   if (segment === "Intl.Segmenter") {
-    if (Intl.Segmenter) {
+    if (typeof Intl?.Segmenter === "function") {
       if (!hansIntlSegmenter) {
         hansIntlSegmenter = new Intl.Segmenter("zh-Hans-CN", {
           granularity: "word",

diff --git a/src/segment.ts b/src/segment.ts
@@ -35,7 +35,7 @@ export function segment(hans: string, segment?: IPinyinSegment): string[] {
 
   // Intl.Segmenter
   if (segment === "Intl.Segmenter") {
-    if (Intl.Segmenter) {
+    if (typeof Intl?.Segmenter === "function") {
       if (!hansIntlSegmenter) {
         hansIntlSegmenter = new Intl.Segmenter("zh-Hans-CN", {
           granularity: "word",

diff --git a/src/util.ts b/src/util.ts
@@ -70,7 +70,7 @@ export function convertUserOptions(options?: IPinyinOptions): IPinyinAllOptions
   let segment: IPinyinSegment | undefined = undefined;
   if (options?.segment) {
     if (options?.segment === true) {
-      segment = "nodejieba";
+      segment = "Intl.Segmenter";
     } else {
       segment = options.segment;
     }

diff --git a/test/test.ts b/test/test.ts
@@ -136,6 +136,7 @@ const cases: any[] = [
   // 英文
   [ "a", {
     STYLE_NORMAL:       [["a"]],
+    STYLE_PASSPORT:     [["a"]],
     STYLE_TONE:         [["a"]],
     STYLE_TONE2:        [["a"]],
     STYLE_TO3NE:        [["a"]],
@@ -144,6 +145,7 @@ const cases: any[] = [
   } ],
   [ "aa", {
     STYLE_NORMAL:       [["aa"]],
+    STYLE_PASSPORT:     [["aa"]],
     STYLE_TONE:         [["aa"]],
     STYLE_TONE2:        [["aa"]],
     STYLE_TO3NE:        [["aa"]],
@@ -152,6 +154,7 @@ const cases: any[] = [
   } ],
   [ "a a", {
     STYLE_NORMAL:       [["a a"]],
+    STYLE_PASSPORT:     [["a a"]],
     STYLE_TONE:         [["a a"]],
     STYLE_TONE2:        [["a a"]],
     STYLE_TO3NE:        [["a a"]],
@@ -160,6 +163,7 @@ const cases: any[] = [
   } ],
   [ "一 一", {
     STYLE_NORMAL:       [["yi"], [" "], ["yi"]],
+    STYLE_PASSPORT:     [["YI"], [" "], ["YI"]],
     STYLE_TONE:         [["yī"], [" "], ["yī"]],
     STYLE_TONE2:        [["yi1"], [" "], ["yi1"]],
     STYLE_TO3NE:        [["yi1"], [" "], ["yi1"]],
@@ -170,6 +174,7 @@ const cases: any[] = [
   // 中英混合
   [ "拼音(pinyin)", {
     STYLE_NORMAL:       [["pin"], ["yin"], ["(pinyin)"]],
+    STYLE_PASSPORT:     [["PIN"], ["YIN"], ["(pinyin)"]],
     STYLE_TONE:         [["pīn"], ["yīn"], ["(pinyin)"]],
     STYLE_TONE2:        [["pin1"], ["yin1"], ["(pinyin)"]],
     STYLE_TO3NE:        [["pi1n"], ["yi1n"], ["(pinyin)"]],
@@ -180,6 +185,7 @@ const cases: any[] = [
   // 中英混合，多音字，单音词。
   [ "中国(china)", {
     STYLE_NORMAL:       [["zhong"], ["guo"], ["(china)"]],
+    STYLE_PASSPORT:     [["ZHONG"], ["GUO"], ["(china)"]],
     STYLE_TONE:         {
       normal: [["zhōng", "zhòng"], ["guó"], ["(china)"]],
       segment: [["zhōng"], ["guó"], ["(china)"]],
@@ -205,6 +211,10 @@ const cases: any[] = [
       normal: [["páng", "fǎng"], ["huáng"]],
       segment: [["páng"], ["huáng"]],
     },
+    STYLE_PASSPORT:     {
+      normal: [["PANG", "FANG"], ["HUANG"]],
+      segment: [["PANG"], ["HUANG"]],
+    },
     STYLE_TONE2:        {
       normal: [["pang2", "fang3"], ["huang2"]],
       segment: [["pang2"], ["huang2"]],
@@ -226,6 +236,7 @@ const cases: any[] = [
   // 中英混合，多音字，单音词。
   [ "0套价", {
     STYLE_NORMAL:       [["0"], ["tao"], ["jia", "jie"]],
+    STYLE_PASSPORT:     [["0"], ["TAO"], ["JIA", "JIE"]],
     STYLE_TONE:         [["0"], ["tào"], ["jià", "jiè", "jie"]],
     STYLE_TONE2:        [["0"], ["tao4"], ["jia4", "jie4", "jie"]],
     STYLE_TO3NE:        [["0"], ["ta4o"], ["jia4", "jie4", "jie"]],
@@ -235,12 +246,34 @@ const cases: any[] = [
 
   // 其他
   [ "女流氓", {
-    STYLE_NORMAL:       [["nv", "ru"], ["liu"], ["mang", "meng"]],
-    STYLE_TONE:         [["nǚ", "rǔ"], ["liú"], ["máng", "méng"]],
-    STYLE_TONE2:        [["nv3", "ru3"], ["liu2"], ["mang2", "meng2"]],
-    STYLE_TO3NE:        [["nv3", "ru3"], ["liu2"], ["ma2ng", "me2ng"]],
-    STYLE_INITIALS:     [["n", "r"], ["l"], ["m"]],
-    STYLE_FIRST_LETTER: [["n", "r"], ["l"], ["m"]],
+    STYLE_NORMAL:       {
+      normal: [["nv", "ru"], ["liu"], ["mang", "meng"]],
+      segment: [["nv", "ru"], ["liu"], ["mang"]],
+    },
+    STYLE_PASSPORT:       {
+      normal: [["NYU", "RU"], ["LIU"], ["MANG", "MENG"]],
+      segment: [["NYU", "RU"], ["LIU"], ["MANG"]],
+    },
+    STYLE_TONE:         {
+      normal: [["nǚ", "rǔ"], ["liú"], ["máng", "méng"]],
+      segment: [["nǚ", "rǔ"], ["liú"], ["máng"]],
+    },
+    STYLE_TONE2:        {
+      normal: [["nv3", "ru3"], ["liu2"], ["mang2", "meng2"]],
+      segment: [["nv3", "ru3"], ["liu2"], ["mang2"]],
+    },
+    STYLE_TO3NE:        {
+      normal: [["nv3", "ru3"], ["liu2"], ["ma2ng", "me2ng"]],
+      segment: [["nv3", "ru3"], ["liu2"], ["ma2ng"]],
+    },
+    STYLE_INITIALS:     {
+      normal: [["n", "r"], ["l"], ["m"]],
+      segment: [["n", "r"], ["l"], ["m"]],
+    },
+    STYLE_FIRST_LETTER: {
+      normal: [["n", "r"], ["l"], ["m"]],
+      segment: [["n", "r"], ["l"], ["m"]],
+    },
   } ],
 ];
 
@@ -255,6 +288,7 @@ describe("姓名模式", function() {
   it("单姓", function() {
     expect(pinyin("华夫人", { mode: "NORMAL"})).toEqual([["huá"], ["fū"], ["rén"]]);
     expect(pinyin("华夫人", { mode: "SURNAME"})).toEqual([["huà"], ["fū"], ["rén"]]);
+    expect(pinyin("吕布", { mode: "SURNAME", style: "passport" })).toEqual([["LYU"], ["BU"]]);
   });
 });
 
@@ -343,9 +377,9 @@ describe("pinyin group", function() {
   });
 
   it("落叶落下着落", function() {
-    const han = "落叶落下着落";
+    const han = "落叶落下";
     const py = pinyin(han, {segment: true, group: true, heteronym: false});
-    expect(py).toEqual([["luòyè"], ["làxià"], ["zháoluò"]]);
+    expect(py).toEqual([["luòyè"], ["làxià"]]);
   });
 });
 
@@ -387,7 +421,7 @@ describe("pinyin compact", function() {
     const py = pinyin(han, { style: "FIRST_LETTER", heteronym: true, compact: true });
     expect(py).toEqual([["w", "m", "d", "a", "z", "y"], ["w", "m", "d", "a", "c", "y"]]);
   });
-  
+
   it("行不行 compact without heternonyms, normal style", function() {
     const han = "行不行";
     const py = pinyin(han, { style: pinyin.STYLE_NORMAL, segment: true, heteronym: false, compact: true });