From f5002c104a8e988cb86240ea751d17a1b7a91bef Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Thu, 30 Sep 2021 14:36:22 +0900 Subject: [PATCH 01/87] Markdown: Do not insert spaces between Chinese/Japanese & latin letters (#6385) Powered by #8526 & prettier-plugin-md-nocjsp --- changelog_unreleased/markdown/11597.md | 76 +++++++++++++++++++ src/language-markdown/constants.evaluate.js | 6 +- src/language-markdown/utils.js | 21 +---- .../__snapshots__/jsfmt.spec.js.snap | 23 +++--- .../__snapshots__/jsfmt.spec.js.snap | 61 ++++++++++++--- .../markdown/splitCjkText/han-kana-alnum.md | 7 ++ 6 files changed, 149 insertions(+), 45 deletions(-) create mode 100644 changelog_unreleased/markdown/11597.md create mode 100644 tests/format/markdown/splitCjkText/han-kana-alnum.md diff --git a/changelog_unreleased/markdown/11597.md b/changelog_unreleased/markdown/11597.md new file mode 100644 index 000000000000..0699b1b964e2 --- /dev/null +++ b/changelog_unreleased/markdown/11597.md @@ -0,0 +1,76 @@ +#### No more inserting space between Chinese or Japanese (e.g. hanzi and kana) and western characters (#11597 by @tats-u) + +The current behavior of inserting whitespace (U+0020) between Chinese or Japanese (e.g. hanzi/kanji and kana) and western (e.g. alphanumerics) characters is not based on the official layout guidelines in Japanese and Chinese but [non-standard and local one in Chinese](https://github.com/ruanyf/document-style-guide/blob/master/docs/text.md). + +Official Japanese guideline (W3C): + +> 3.9.1 Differences in Positioning of Characters and Symbols +> +> The positioning of characters and symbols may vary depending on the following. +> +> d. Are characters and symbols appearing in sequence in solid setting, or will there be a fixed size space between them? For example, sequences of ideographic characters (cl-19) and hiragana (cl-15) are set solid, and for Western characters (cl-27) following hiragana (cl-15) there will be quarter em spacing. + + + +> “one quarter em” means one quarter of the full-width size. (JIS Z 8125) +> “one quarter em space” means amount of space that is one quarter size of em space. + + + + +Official Japanese guideline (JIS X 4051:2004): + +> 4.7 和欧文混植処理 +> +> a) 横書きでは,和文と欧文との間の空き量は,四分アキを原則とする。 +> +> 4.7 Mixed Japanese and Western Text Composition +> +> a) In horizontal writing, the space between Japanese and western text should be one quarter em, as a rule. +> +> PR Author's Note: Original text is written only in Japanese and translation is based on [DeepL](https://www.deepl.com/translator). + + (Japanese) + +Official Chinese guideline (W3C): + +> 3.2.2 Mixed Text Composition in Horizontal Writing Mode +> +> In principle, there is tracking or spacing between an adjacent Han character and a Western character of up to one quarter of a Han character width, except at the line start or end. +> +> NOTE: Another approach is to use a Western word space (U+0020 SPACE), in which case the width depends on the font in use. + + + +As mentioned above, whitespace (U+0020) is allowed to be substituted for one quarter em only in Chinese, although they have a similar appearance. Also, even in Chinese, the rule is not adopted even in the W3C guideline page but is mentioned as just one of the options. + +Some renderers (e.g. convert to PDF using Pandoc with the backend of LaTeX) can automatically insert genuine one quarter em. The width of whitespace is different from one quarter em, so inserting whitespace (U+0020) takes away the option to leave it to renderers to insert one quarter em. Adding space should be left to renderers and should not be done by Prettier, just a formatter. + + +Adding whitespace may interfere with searches for text containing both Chinese or Japanese and western characters. For example, you cannot find “第1章” (Chapter 1) in a Markdown document or its derivative just by searching by the string “第1章” but “第 1 章”. + +To make matters worst, once whitespace is inserted, it is difficult to remove it. The following sentence cannot be said to be wrong. + +> 作る means make in Japanese. + +The too simple rule of removing whitespace between Chinese or Japanese characters and alphanumerics removes that between “作る” and “means” unless you modify the sentence, that is, quote “作る”. It is so difficult to create a common rule that can safely remove whitespace from all documents and deserves to be included in Prettier. + +In conclusion, the imposition of the non-standard rule by just a formatter must be ended. + + +```markdown + +漢字Alphabetsひらがな12345カタカナ67890한글 + +漢字 Alphabets ひらがな 12345 カタカナ 67890 한글 + + +漢字 Alphabets ひらがな 12345 カタカナ 67890한글 + +漢字 Alphabets ひらがな 12345 カタカナ 67890 한글 + + +漢字Alphabetsひらがな12345カタカナ67890한글 + +漢字 Alphabets ひらがな 12345 カタカナ 67890 한글 +``` diff --git a/src/language-markdown/constants.evaluate.js b/src/language-markdown/constants.evaluate.js index 8f3f14480660..331880dba40e 100644 --- a/src/language-markdown/constants.evaluate.js +++ b/src/language-markdown/constants.evaluate.js @@ -20,10 +20,6 @@ const cjkPattern = `(?:${cjkRegex() Block: ["Variation_Selectors", "Variation_Selectors_Supplement"], }).toString()})?`; -const kPattern = unicodeRegex({ Script: ["Hangul"] }) - .union(unicodeRegex({ Script_Extensions: ["Hangul"] })) - .toString(); - // http://spec.commonmark.org/0.25/#ascii-punctuation-character const asciiPunctuationCharset = /* prettier-ignore */ regexpUtil.charset( @@ -49,4 +45,4 @@ const punctuationCharset = unicodeRegex({ const punctuationPattern = punctuationCharset.toString(); -export { cjkPattern, kPattern, punctuationPattern }; +export { cjkPattern, punctuationPattern }; diff --git a/src/language-markdown/utils.js b/src/language-markdown/utils.js index fa52c1b80060..eaf5f2d96297 100644 --- a/src/language-markdown/utils.js +++ b/src/language-markdown/utils.js @@ -1,9 +1,5 @@ import { locStart, locEnd } from "./loc.js"; -import { - cjkPattern, - kPattern, - punctuationPattern, -} from "./constants.evaluate.js"; +import { cjkPattern, punctuationPattern } from "./constants.evaluate.js"; const INLINE_NODE_TYPES = [ "liquidNode", @@ -33,7 +29,6 @@ const INLINE_NODE_WRAPPER_TYPES = [ "heading", ]; -const kRegex = new RegExp(kPattern); const punctuationRegex = new RegExp(punctuationPattern); /** @@ -42,8 +37,7 @@ const punctuationRegex = new RegExp(punctuationPattern); */ function splitText(text, options) { const KIND_NON_CJK = "non-cjk"; - const KIND_CJ_LETTER = "cj-letter"; - const KIND_K_LETTER = "k-letter"; + const KIND_CJK_LETTER = "cjk-letter"; const KIND_CJK_PUNCTUATION = "cjk-punctuation"; /** @@ -120,7 +114,7 @@ function splitText(text, options) { : { type: "word", value: innerToken, - kind: kRegex.test(innerToken) ? KIND_K_LETTER : KIND_CJ_LETTER, + kind: KIND_CJK_LETTER, hasLeadingPunctuation: false, hasTrailingPunctuation: false, } @@ -134,15 +128,6 @@ function splitText(text, options) { const lastNode = nodes.at(-1); if (lastNode && lastNode.type === "word") { if ( - (lastNode.kind === KIND_NON_CJK && - node.kind === KIND_CJ_LETTER && - !lastNode.hasTrailingPunctuation) || - (lastNode.kind === KIND_CJ_LETTER && - node.kind === KIND_NON_CJK && - !node.hasLeadingPunctuation) - ) { - nodes.push({ type: "whitespace", value: " " }); - } else if ( !isBetween(KIND_NON_CJK, KIND_CJK_PUNCTUATION) && // disallow leading/trailing full-width whitespace ![lastNode.value, node.value].some((value) => /\u3000/.test(value)) diff --git a/tests/format/markdown/paragraph/__snapshots__/jsfmt.spec.js.snap b/tests/format/markdown/paragraph/__snapshots__/jsfmt.spec.js.snap index 6bb45612bc67..7e789a3c56fa 100644 --- a/tests/format/markdown/paragraph/__snapshots__/jsfmt.spec.js.snap +++ b/tests/format/markdown/paragraph/__snapshots__/jsfmt.spec.js.snap @@ -31,16 +31,15 @@ IVS 麻󠄁羽󠄀‼️ 這是一段很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長 很長的段落 -這是一個 English 混合著中文的一段 Paragraph!這是一個 English 混合著中文的一段 -Paragraph!這是一個 English 混合著中文的一段 Paragraph!這是一個 English 混合著 -中文的一段 Paragraph!這是一個 English 混合著中文的一段 Paragraph!這是一個 -English 混合著中文的一段 Paragraph!這是一個 English 混合著中文的一段 -Paragraph!這是一個 English 混合著中文的一段 Paragraph!這是一個 English 混合著 -中文的一段 Paragraph!這是一個 English 混合著中文的一段 Paragraph!這是一個 -English 混合著中文的一段 Paragraph!這是一個 English 混合著中文的一段 -Paragraph!這是一個 English 混合著中文的一段 Paragraph!這是一個 English 混合著 -中文的一段 Paragraph!這是一個 English 混合著中文的一段 Paragraph!這是一個 -English 混合著中文的一段 Paragraph! +這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段 +Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的 +一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中 +文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合 +著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English +混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個 +English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是 +一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph! +這是一個English混合著中文的一段Paragraph! 全  形 空白全  形 空白全  形 空白全  形 空白全  形 空白 全  形 空白全  形 空白全  形 空白 @@ -90,7 +89,7 @@ IVS 麻󠄁羽󠄀‼️ =====================================output===================================== 這是一段很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長的段落 -這是一個 English 混合著中文的一段 Paragraph!這是一個 English 混合著中文的一段 Paragraph!這是一個 English 混合著中文的一段 Paragraph!這是一個 English 混合著中文的一段 Paragraph!這是一個 English 混合著中文的一段 Paragraph!這是一個 English 混合著中文的一段 Paragraph!這是一個 English 混合著中文的一段 Paragraph!這是一個 English 混合著中文的一段 Paragraph!這是一個 English 混合著中文的一段 Paragraph!這是一個 English 混合著中文的一段 Paragraph!這是一個 English 混合著中文的一段 Paragraph!這是一個 English 混合著中文的一段 Paragraph!這是一個 English 混合著中文的一段 Paragraph!這是一個 English 混合著中文的一段 Paragraph!這是一個 English 混合著中文的一段 Paragraph!這是一個 English 混合著中文的一段 Paragraph! +這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph! 全  形 空白全  形 空白全  形 空白全  形 空白全  形 空白全  形 空白全  形 空白全  形 空白 @@ -139,7 +138,7 @@ IVS 麻󠄁羽󠄀‼️ =====================================output===================================== 這是一段很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長的段落 -這是一個 English 混合著中文的一段 Paragraph!這是一個 English 混合著中文的一段 Paragraph!這是一個 English 混合著中文的一段 Paragraph!這是一個 English 混合著中文的一段 Paragraph!這是一個 English 混合著中文的一段 Paragraph!這是一個 English 混合著中文的一段 Paragraph!這是一個 English 混合著中文的一段 Paragraph!這是一個 English 混合著中文的一段 Paragraph!這是一個 English 混合著中文的一段 Paragraph!這是一個 English 混合著中文的一段 Paragraph!這是一個 English 混合著中文的一段 Paragraph!這是一個 English 混合著中文的一段 Paragraph!這是一個 English 混合著中文的一段 Paragraph!這是一個 English 混合著中文的一段 Paragraph!這是一個 English 混合著中文的一段 Paragraph!這是一個 English 混合著中文的一段 Paragraph! +這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph! 全  形 空白全  形 空白全  形 空白全  形 空白全  形 空白全  形 空白全  形 空白全  形 空白 diff --git a/tests/format/markdown/splitCjkText/__snapshots__/jsfmt.spec.js.snap b/tests/format/markdown/splitCjkText/__snapshots__/jsfmt.spec.js.snap index a2568471fa0e..82f3467d9cb6 100644 --- a/tests/format/markdown/splitCjkText/__snapshots__/jsfmt.spec.js.snap +++ b/tests/format/markdown/splitCjkText/__snapshots__/jsfmt.spec.js.snap @@ -38,6 +38,48 @@ proseWrap: "always" ================================================================================ `; +exports[`han-kana-alnum.md - {"proseWrap":"always"} format 1`] = ` +====================================options===================================== +parsers: ["markdown"] +printWidth: 80 +proseWrap: "always" + | printWidth +=====================================input====================================== +這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph! + +Englishと日本語が混ざったParagraphだよ!Englishと日本語が混ざったParagraphだよ!Englishと日本語が混ざったParagraphだよ!Englishと日本語が混ざったParagraphだよ!Englishと日本語が混ざったParagraphだよ!Englishと日本語が混ざったParagraphだよ!Englishと日本語が混ざったParagraphだよ! + +MarkdownフォーマッタPrettierサイコー!MarkdownフォーマッタPrettierサイコー!MarkdownフォーマッタPrettierサイコー!MarkdownフォーマッタPrettierサイコー!MarkdownフォーマッタPrettierサイコー!MarkdownフォーマッタPrettierサイコー!MarkdownフォーマッタPrettierサイコー!MarkdownフォーマッタPrettierサイコー! + +Prettierの最初のCommitは2016年11月29日に行われました。Prettier的第一次Commit是在2016年11月29日。 + +=====================================output===================================== +這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段 +Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的 +一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中 +文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合 +著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English +混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個 +English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是 +一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph! +這是一個English混合著中文的一段Paragraph! + +Englishと日本語が混ざったParagraphだよ!Englishと日本語が混ざったParagraphだよ +!Englishと日本語が混ざったParagraphだよ!Englishと日本語が混ざったParagraphだよ +!Englishと日本語が混ざったParagraphだよ!Englishと日本語が混ざったParagraphだよ +!Englishと日本語が混ざったParagraphだよ! + +MarkdownフォーマッタPrettierサイコー!MarkdownフォーマッタPrettierサイコー +!MarkdownフォーマッタPrettierサイコー!MarkdownフォーマッタPrettierサイコー +!MarkdownフォーマッタPrettierサイコー!MarkdownフォーマッタPrettierサイコー +!MarkdownフォーマッタPrettierサイコー!MarkdownフォーマッタPrettierサイコー! + +Prettierの最初のCommitは2016年11月29日に行われました。Prettier的第一次Commit是在 +2016年11月29日。 + +================================================================================ +`; + exports[`korean.md - {"proseWrap":"always"} format 1`] = ` ====================================options===================================== parsers: ["markdown"] @@ -78,16 +120,15 @@ proseWrap: "always" 這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph! =====================================output===================================== -這是一個 English 混合著中文的一段 Paragraph!這是一個 English 混合著中文的一段 -Paragraph!這是一個 English 混合著中文的一段 Paragraph!這是一個 English 混合著 -中文的一段 Paragraph!這是一個 English 混合著中文的一段 Paragraph!這是一個 -English 混合著中文的一段 Paragraph!這是一個 English 混合著中文的一段 -Paragraph!這是一個 English 混合著中文的一段 Paragraph!這是一個 English 混合著 -中文的一段 Paragraph!這是一個 English 混合著中文的一段 Paragraph!這是一個 -English 混合著中文的一段 Paragraph!這是一個 English 混合著中文的一段 -Paragraph!這是一個 English 混合著中文的一段 Paragraph!這是一個 English 混合著 -中文的一段 Paragraph!這是一個 English 混合著中文的一段 Paragraph!這是一個 -English 混合著中文的一段 Paragraph! +這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段 +Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的 +一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中 +文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合 +著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English +混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個 +English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是 +一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph! +這是一個English混合著中文的一段Paragraph! ================================================================================ `; diff --git a/tests/format/markdown/splitCjkText/han-kana-alnum.md b/tests/format/markdown/splitCjkText/han-kana-alnum.md new file mode 100644 index 000000000000..b7a8e4fd7af0 --- /dev/null +++ b/tests/format/markdown/splitCjkText/han-kana-alnum.md @@ -0,0 +1,7 @@ +這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph! + +Englishと日本語が混ざったParagraphだよ!Englishと日本語が混ざったParagraphだよ!Englishと日本語が混ざったParagraphだよ!Englishと日本語が混ざったParagraphだよ!Englishと日本語が混ざったParagraphだよ!Englishと日本語が混ざったParagraphだよ!Englishと日本語が混ざったParagraphだよ! + +MarkdownフォーマッタPrettierサイコー!MarkdownフォーマッタPrettierサイコー!MarkdownフォーマッタPrettierサイコー!MarkdownフォーマッタPrettierサイコー!MarkdownフォーマッタPrettierサイコー!MarkdownフォーマッタPrettierサイコー!MarkdownフォーマッタPrettierサイコー!MarkdownフォーマッタPrettierサイコー! + +Prettierの最初のCommitは2016年11月29日に行われました。Prettier的第一次Commit是在2016年11月29日。 From 4ece6337aead0c6e833e00bfd4d615000d90361e Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Mon, 25 Oct 2021 00:19:07 +0900 Subject: [PATCH 02/87] Add "hanzi" (Chinese ideographs) to cspell.json --- cspell.json | 1 + 1 file changed, 1 insertion(+) diff --git a/cspell.json b/cspell.json index bd9ebdb474ff..51ba6ea13f07 100644 --- a/cspell.json +++ b/cspell.json @@ -122,6 +122,7 @@ "gofmt", "Gregor", "Hampus", + "hanzi", "hardline", "hardlines", "hashbang", From 60e675ebe7972ca2c5b312028e7ebd2297fb362a Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Sun, 24 Jul 2022 18:53:06 +0900 Subject: [PATCH 03/87] Do not replace LF adjacent to han/kana with space This affects only results when `proseWraps` is `always`. --- src/language-markdown/constants.evaluate.js | 6 +++++- src/language-markdown/printer-markdown.js | 16 +++++++++++++--- src/language-markdown/utils.js | 15 +++++++++++---- 3 files changed, 29 insertions(+), 8 deletions(-) diff --git a/src/language-markdown/constants.evaluate.js b/src/language-markdown/constants.evaluate.js index 331880dba40e..8f3f14480660 100644 --- a/src/language-markdown/constants.evaluate.js +++ b/src/language-markdown/constants.evaluate.js @@ -20,6 +20,10 @@ const cjkPattern = `(?:${cjkRegex() Block: ["Variation_Selectors", "Variation_Selectors_Supplement"], }).toString()})?`; +const kPattern = unicodeRegex({ Script: ["Hangul"] }) + .union(unicodeRegex({ Script_Extensions: ["Hangul"] })) + .toString(); + // http://spec.commonmark.org/0.25/#ascii-punctuation-character const asciiPunctuationCharset = /* prettier-ignore */ regexpUtil.charset( @@ -45,4 +49,4 @@ const punctuationCharset = unicodeRegex({ const punctuationPattern = punctuationCharset.toString(); -export { cjkPattern, punctuationPattern }; +export { cjkPattern, kPattern, punctuationPattern }; diff --git a/src/language-markdown/printer-markdown.js b/src/language-markdown/printer-markdown.js index a1139079f06b..4e8826700892 100644 --- a/src/language-markdown/printer-markdown.js +++ b/src/language-markdown/printer-markdown.js @@ -136,6 +136,9 @@ function genericPrint(path, options, print) { return escapedValue; } case "whitespace": { + const parentNode = path.getParentNode(); + const index = parentNode.children.indexOf(node); + const previous = parentNode.children[index - 1]; const { next } = path; const proseWrap = @@ -144,7 +147,7 @@ function genericPrint(path, options, print) { ? "never" : options.proseWrap; - return printLine(path, node.value, { proseWrap }); + return printLine(path, node.value, { proseWrap }, { previous, next }); } case "emphasis": { let style; @@ -525,7 +528,7 @@ function getAncestorNode(path, typeOrTypes) { return counter === -1 ? null : path.getParentNode(counter); } -function printLine(path, value, options) { +function printLine(path, value, options, adjacentNodes) { if (options.proseWrap === "preserve" && value === "\n") { return hardline; } @@ -533,7 +536,14 @@ function printLine(path, value, options) { const isBreakable = options.proseWrap === "always" && !getAncestorNode(path, SINGLE_LINE_NODE_TYPES); - return value !== "" + // Chinese and Japanese does not use U+0020 Space to dive words, so U+00A0 No-break space must not be replaced with it. + // Behavior in other languages will not be changed because there are too much things to consider. (PR welcome) + const canLineBreakBeConvertedToSpace = !( + typeof adjacentNodes === "object" && + (adjacentNodes?.previous?.kind === "cj-letter" || + adjacentNodes?.next?.kind === "cj-letter") + ); + return value !== "" && canLineBreakBeConvertedToSpace ? isBreakable ? line : " " diff --git a/src/language-markdown/utils.js b/src/language-markdown/utils.js index eaf5f2d96297..5ea0d7a9015d 100644 --- a/src/language-markdown/utils.js +++ b/src/language-markdown/utils.js @@ -1,5 +1,9 @@ import { locStart, locEnd } from "./loc.js"; -import { cjkPattern, punctuationPattern } from "./constants.evaluate.js"; +import { + cjkPattern, + kPattern, + punctuationPattern, +} from "./constants.evaluate.js"; const INLINE_NODE_TYPES = [ "liquidNode", @@ -29,6 +33,7 @@ const INLINE_NODE_WRAPPER_TYPES = [ "heading", ]; +const kRegex = new RegExp(kPattern); const punctuationRegex = new RegExp(punctuationPattern); /** @@ -37,7 +42,8 @@ const punctuationRegex = new RegExp(punctuationPattern); */ function splitText(text, options) { const KIND_NON_CJK = "non-cjk"; - const KIND_CJK_LETTER = "cjk-letter"; + const KIND_CJ_LETTER = "cj-letter"; + const KIND_K_LETTER = "k-letter"; const KIND_CJK_PUNCTUATION = "cjk-punctuation"; /** @@ -46,7 +52,7 @@ function splitText(text, options) { * | { * type: "word", * value: string, - * kind: string, + * kind: KIND_NON_CJK | KIND_CJ_LETTER | KIND_K_LETTER | KIND_CJK_PUNCTUATION, * hasLeadingPunctuation: boolean, * hasTrailingPunctuation: boolean * } @@ -114,7 +120,8 @@ function splitText(text, options) { : { type: "word", value: innerToken, - kind: KIND_CJK_LETTER, + // Korean uses space to divide words, but Chinese & Japanese do not + kind: kRegex.test(innerToken) ? KIND_K_LETTER : KIND_CJ_LETTER, hasLeadingPunctuation: false, hasTrailingPunctuation: false, } From 8b904ac2961543317d4f8916dade7aa8d8ff8712 Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Sun, 24 Jul 2022 19:06:03 +0900 Subject: [PATCH 04/87] Don't remove space next to Chinese & Japanese letters --- src/language-markdown/printer-markdown.js | 1 + 1 file changed, 1 insertion(+) diff --git a/src/language-markdown/printer-markdown.js b/src/language-markdown/printer-markdown.js index 4e8826700892..d3f6a1802751 100644 --- a/src/language-markdown/printer-markdown.js +++ b/src/language-markdown/printer-markdown.js @@ -539,6 +539,7 @@ function printLine(path, value, options, adjacentNodes) { // Chinese and Japanese does not use U+0020 Space to dive words, so U+00A0 No-break space must not be replaced with it. // Behavior in other languages will not be changed because there are too much things to consider. (PR welcome) const canLineBreakBeConvertedToSpace = !( + value === "\n" && typeof adjacentNodes === "object" && (adjacentNodes?.previous?.kind === "cj-letter" || adjacentNodes?.next?.kind === "cj-letter") From 8190ade6b203a36c57de13eb65b557677653c37f Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Sat, 6 Aug 2022 20:14:47 +0900 Subject: [PATCH 05/87] Investigate in detail whether Space should be inserted --- src/language-markdown/printer-markdown.js | 83 ++++++++++++++++++++++- 1 file changed, 82 insertions(+), 1 deletion(-) diff --git a/src/language-markdown/printer-markdown.js b/src/language-markdown/printer-markdown.js index d3f6a1802751..b83192dc3089 100644 --- a/src/language-markdown/printer-markdown.js +++ b/src/language-markdown/printer-markdown.js @@ -528,6 +528,80 @@ function getAncestorNode(path, typeOrTypes) { return counter === -1 ? null : path.getParentNode(counter); } +/** + * Finds out if Space is tend to be inserted between `字` (ideograph & kana) and `A` (other letters e.g. alphanumerics) in the sentence. + * + * @param {*} path current position in nodes tree + * @returns {boolean} `true` if Space is tend to be inserted between these types of letters, `false` otherwise. + */ +function isSentenceUseCJDividingSpace(path) { + const sentenceNode = getAncestorNode(path, "sentence"); + if (sentenceNode.isCJSpacingUsing !== undefined) { + return sentenceNode.isCJSpacingUsing; + } + + const cjNonCJKSpacingStatistics = { + " ": 0, + "": 0, + }; + for (let i = 0; i < sentenceNode.children.length; ++i) { + const node = sentenceNode.children[i]; + if (node.type === "whitespace") { + switch (node.value) { + case " ": + case "": { + const previous = sentenceNode.children[i - 1]; + const next = sentenceNode.children[i + 1]; + if ( + !( + (previous?.kind === "cj-letter" && next?.kind === "non-cjk") || + (previous?.kind === "non-cjk" && next?.kind === "cj-letter") + ) + ) { + continue; + } + ++cjNonCJKSpacingStatistics[node.value]; + continue; + } + } + } + } + // Injects a property to cache the result. + sentenceNode.isCJSpacingUsing = + cjNonCJKSpacingStatistics[" "] > cjNonCJKSpacingStatistics[""]; + return sentenceNode.isCJSpacingUsing; +} + +/** + * Looks for 1st `:::` (MUST be followed by a Space) from 2nd `:::`. + * + * ``` + * ::: foo + * bar + * ::: + * ``` + * + * @param {*} path current position in nodes tree + * @param {*} mark what to look for + * @returns `true` if a `mark` followed by Space (U+0020) is found before `path`, `false` otherwise. + */ +function isCorrespondingMarkFollowedBySpaceBefore(path, mark) { + const sentenceNode = getAncestorNode(path, "sentence"); + if ( + sentenceNode.children + .slice(0, getPenultimate(path.stack)) + .some( + (node, i, array) => + node.value === mark && + array[i + 1]?.type === "whitespace" && + array[i + 1]?.value === " " + ) + ) { + return true; + } + return false; +} + function printLine(path, value, options, adjacentNodes) { if (options.proseWrap === "preserve" && value === "\n") { return hardline; @@ -542,7 +616,14 @@ function printLine(path, value, options, adjacentNodes) { value === "\n" && typeof adjacentNodes === "object" && (adjacentNodes?.previous?.kind === "cj-letter" || - adjacentNodes?.next?.kind === "cj-letter") + adjacentNodes?.next?.kind === "cj-letter") && + // we wonder if there are other marks to be considered. + (adjacentNodes.next.value !== ":::" || + !isCorrespondingMarkFollowedBySpaceBefore( + path, + adjacentNodes.next.value + )) && + !isSentenceUseCJDividingSpace(path) ); return value !== "" && canLineBreakBeConvertedToSpace ? isBreakable From 567f0a7fb07b4a05e90f0a509f6f7278b7f6b01a Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Fri, 26 Aug 2022 00:44:25 +0900 Subject: [PATCH 06/87] Update src/language-markdown/printer-markdown.js Co-authored-by: Sosuke Suzuki --- src/language-markdown/printer-markdown.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/language-markdown/printer-markdown.js b/src/language-markdown/printer-markdown.js index b83192dc3089..f07f0e6f32a6 100644 --- a/src/language-markdown/printer-markdown.js +++ b/src/language-markdown/printer-markdown.js @@ -610,7 +610,7 @@ function printLine(path, value, options, adjacentNodes) { const isBreakable = options.proseWrap === "always" && !getAncestorNode(path, SINGLE_LINE_NODE_TYPES); - // Chinese and Japanese does not use U+0020 Space to dive words, so U+00A0 No-break space must not be replaced with it. + // Chinese and Japanese does not use U+0020 Space to divide words, so U+00A0 No-break space must not be replaced with it. // Behavior in other languages will not be changed because there are too much things to consider. (PR welcome) const canLineBreakBeConvertedToSpace = !( value === "\n" && From d0e71b559030ed3b4d76082fa7069b93d0229b3f Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Sat, 27 Aug 2022 20:12:45 +0900 Subject: [PATCH 07/87] Improve conclusion --- changelog_unreleased/markdown/11597.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changelog_unreleased/markdown/11597.md b/changelog_unreleased/markdown/11597.md index 0699b1b964e2..78684efc97ea 100644 --- a/changelog_unreleased/markdown/11597.md +++ b/changelog_unreleased/markdown/11597.md @@ -55,7 +55,7 @@ To make matters worst, once whitespace is inserted, it is difficult to remove it The too simple rule of removing whitespace between Chinese or Japanese characters and alphanumerics removes that between “作る” and “means” unless you modify the sentence, that is, quote “作る”. It is so difficult to create a common rule that can safely remove whitespace from all documents and deserves to be included in Prettier. -In conclusion, the imposition of the non-standard rule by just a formatter must be ended. +For these reasons, we stopped inserting space. ```markdown From ddeab05a2e89a9fc69f8a062ed7d774ff25fca22 Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Sat, 27 Aug 2022 20:25:09 +0900 Subject: [PATCH 08/87] Improve JSDoc --- src/language-markdown/printer-markdown.js | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/language-markdown/printer-markdown.js b/src/language-markdown/printer-markdown.js index f07f0e6f32a6..9d9feff6f8a2 100644 --- a/src/language-markdown/printer-markdown.js +++ b/src/language-markdown/printer-markdown.js @@ -529,7 +529,9 @@ function getAncestorNode(path, typeOrTypes) { } /** - * Finds out if Space is tend to be inserted between `字` (ideograph & kana) and `A` (other letters e.g. alphanumerics) in the sentence. + * Finds out if Space is tend to be inserted between Chinese or Japanese charaters + * (including ideograph aka han or kanji e.g. `字`, hiragana e.g. `あ`, and katakana e.g. `ア`) + * and other letters (including alphanumerics; e.g. `A` or `1`) in the sentence. * * @param {*} path current position in nodes tree * @returns {boolean} `true` if Space is tend to be inserted between these types of letters, `false` otherwise. From 1500e2b8b321f77b474e2556cf9d0f87ccdcfe61 Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Sun, 4 Sep 2022 14:15:23 +0900 Subject: [PATCH 09/87] Refactor `printLine` & export text node kinds --- src/language-markdown/printer-markdown.js | 101 ++++++++++++++++++---- src/language-markdown/utils.js | 45 ++++++---- 2 files changed, 110 insertions(+), 36 deletions(-) diff --git a/src/language-markdown/printer-markdown.js b/src/language-markdown/printer-markdown.js index 9d9feff6f8a2..d4bbbd341f63 100644 --- a/src/language-markdown/printer-markdown.js +++ b/src/language-markdown/printer-markdown.js @@ -36,6 +36,8 @@ import { INLINE_NODE_TYPES, INLINE_NODE_WRAPPER_TYPES, isAutolink, + KIND_CJK_PUNCTUATION, + KIND_CJ_LETTER, } from "./utils.js"; import visitorKeys from "./visitor-keys.js"; @@ -604,6 +606,69 @@ function isCorrespondingMarkFollowedBySpaceBefore(path, mark) { return false; } +/** + * @typedef {import("./utils.js").TextNode} TextNode + * @typedef {import("./utils.js").WhiteSpaceValue} WhiteSpaceValue + * @typedef {{next?: TextNode | undefined, previous?: TextNode | undefined}} AdjacentNodes + */ + +/** + * Checks if given node can be converted to Space + * + * For example, if you would like to squash the multi-line string `"You might want\nto use Prettier."` into a single line, + * you would replace `"\n"` with `" "`. (`"You might want to use Prettier."`) + * + * However, you should note that Chinese and Japanese does not use U+0020 Space to divide words, so U+00A0 No-break space must not be replaced with it. + * Behavior in other languages (e.g. Thai) will not be changed because there are too much things to consider. (PR welcome) + * + * @param {*} path path of given node + * @param {WhiteSpaceValue} value value of given node (typically `" "` or `"\n"`) + * @param {AdjacentNodes | undefined} adjacentNodes adjacent sibling nodes of given node + * @returns {boolean} `true` if given node can be converted to space, `false` if not (i.e. newline or empty character) + */ +function canBeConvertedToSpace(path, value, adjacentNodes) { + return ( + // "\n" or " ", of course " " always can be converted to Space + value !== "\n" || + // no adjacent nodes + typeof adjacentNodes !== "object" || + // "\n" between non-CJ (not han or kana) chracters always can converted to Space + (adjacentNodes.previous?.kind !== KIND_CJ_LETTER && + adjacentNodes.next?.kind !== KIND_CJ_LETTER) || + (!( + // Do not convert it to Space when: + // "\n" between CJ always SHALL NOT be converted to space + ( + (adjacentNodes.previous?.kind === KIND_CJ_LETTER && + adjacentNodes.next?.kind === KIND_CJ_LETTER) || + // Shall not be converted to Space around CJK punctuation + adjacentNodes.previous?.kind === KIND_CJK_PUNCTUATION || + adjacentNodes.next?.kind === KIND_CJK_PUNCTUATION + ) + ) && + // The following rules do not precede the above rules. + // + // 1. "\n" between special signs and CJ characters + // [corresponding sign][Space][any string][CJ][[\n]][target sign] + // we wonder if there are other marks to be considered. + ((adjacentNodes.next.value === ":::" && + isCorrespondingMarkFollowedBySpaceBefore( + path, + adjacentNodes.next.value + )) || + // 2. If sentence uses space between CJ and alphanumerics (including hangul because of backward-compatibility), + // "\n" can be converted to Space. + // Note: Koran uses space to divide words, so it is difficult to determine if "\n" should be converted to Space. + isSentenceUseCJDividingSpace(path))) + ); +} + +/** + * @param {*} path + * @param {WhiteSpaceValue} value + * @param {*} options + * @param {AdjacentNodes | undefined} [adjacentNodes] + */ function printLine(path, value, options, adjacentNodes) { if (options.proseWrap === "preserve" && value === "\n") { return hardline; @@ -612,28 +677,26 @@ function printLine(path, value, options, adjacentNodes) { const isBreakable = options.proseWrap === "always" && !getAncestorNode(path, SINGLE_LINE_NODE_TYPES); + + // Space or empty + if (value !== "\n") { + return convertToLineIfBreakable(value); + } // Chinese and Japanese does not use U+0020 Space to divide words, so U+00A0 No-break space must not be replaced with it. // Behavior in other languages will not be changed because there are too much things to consider. (PR welcome) - const canLineBreakBeConvertedToSpace = !( - value === "\n" && - typeof adjacentNodes === "object" && - (adjacentNodes?.previous?.kind === "cj-letter" || - adjacentNodes?.next?.kind === "cj-letter") && - // we wonder if there are other marks to be considered. - (adjacentNodes.next.value !== ":::" || - !isCorrespondingMarkFollowedBySpaceBefore( - path, - adjacentNodes.next.value - )) && - !isSentenceUseCJDividingSpace(path) + return convertToLineIfBreakable( + canBeConvertedToSpace(path, value, adjacentNodes) ? " " : "" ); - return value !== "" && canLineBreakBeConvertedToSpace - ? isBreakable - ? line - : " " - : isBreakable - ? softline - : ""; + + /** + * @param value {" " | ""} + */ + function convertToLineIfBreakable(value) { + if (!isBreakable) { + return value; + } + return value === " " ? line : softline; + } } function printTable(path, options, print) { diff --git a/src/language-markdown/utils.js b/src/language-markdown/utils.js index 5ea0d7a9015d..96a8e903310a 100644 --- a/src/language-markdown/utils.js +++ b/src/language-markdown/utils.js @@ -36,28 +36,35 @@ const INLINE_NODE_WRAPPER_TYPES = [ const kRegex = new RegExp(kPattern); const punctuationRegex = new RegExp(punctuationPattern); +const KIND_NON_CJK = "non-cjk"; +const KIND_CJ_LETTER = "cj-letter"; +const KIND_K_LETTER = "k-letter"; +const KIND_CJK_PUNCTUATION = "cjk-punctuation"; + +/** + * @typedef {" " | "\n" | ""} WhiteSpaceValue + * @typedef { KIND_NON_CJK | KIND_CJ_LETTER | KIND_K_LETTER | KIND_CJK_PUNCTUATION} WordKind + * @typedef {{ + * type: "whitespace", + * value: WhiteSpaceValue, + * kind?: undefined, + * hasLeadingPunctuation?: undefined, + * hasTrailingPunctuation?: undefined, + * } | { + * type: "word", + * value: string, + * kind: WordKind, + * hasLeadingPunctuation: boolean, + * hasTrailingPunctuation: boolean, + * }} TextNode + */ + /** * split text into whitespaces and words * @param {string} text */ function splitText(text, options) { - const KIND_NON_CJK = "non-cjk"; - const KIND_CJ_LETTER = "cj-letter"; - const KIND_K_LETTER = "k-letter"; - const KIND_CJK_PUNCTUATION = "cjk-punctuation"; - - /** - * @type {Array< - * { type: "whitespace", value: " " | "\n" | "" } - * | { - * type: "word", - * value: string, - * kind: KIND_NON_CJK | KIND_CJ_LETTER | KIND_K_LETTER | KIND_CJK_PUNCTUATION, - * hasLeadingPunctuation: boolean, - * hasTrailingPunctuation: boolean - * } - * >} - */ + /** @type {Array} */ const nodes = []; const tokens = ( @@ -240,4 +247,8 @@ export { INLINE_NODE_TYPES, INLINE_NODE_WRAPPER_TYPES, isAutolink, + KIND_NON_CJK, + KIND_CJ_LETTER, + KIND_K_LETTER, + KIND_CJK_PUNCTUATION, }; From b36af63b7a1d6077e3ed509d9b0431b3436f9736 Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Sun, 4 Sep 2022 14:16:22 +0900 Subject: [PATCH 10/87] Add type to argument of `isCorrespondingMarkFollowedBySpaceBefore` --- src/language-markdown/printer-markdown.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/language-markdown/printer-markdown.js b/src/language-markdown/printer-markdown.js index d4bbbd341f63..ffcf956e005b 100644 --- a/src/language-markdown/printer-markdown.js +++ b/src/language-markdown/printer-markdown.js @@ -586,7 +586,7 @@ function isSentenceUseCJDividingSpace(path) { * ``` * * @param {*} path current position in nodes tree - * @param {*} mark what to look for + * @param {string} mark what to look for * @returns `true` if a `mark` followed by Space (U+0020) is found before `path`, `false` otherwise. */ function isCorrespondingMarkFollowedBySpaceBefore(path, mark) { From 3399944281664512965b20af8f6d43bfb126d104 Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Sun, 4 Sep 2022 14:37:31 +0900 Subject: [PATCH 11/87] Fix typo --- src/language-markdown/printer-markdown.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/language-markdown/printer-markdown.js b/src/language-markdown/printer-markdown.js index ffcf956e005b..5e89cc86e5dc 100644 --- a/src/language-markdown/printer-markdown.js +++ b/src/language-markdown/printer-markdown.js @@ -632,7 +632,7 @@ function canBeConvertedToSpace(path, value, adjacentNodes) { value !== "\n" || // no adjacent nodes typeof adjacentNodes !== "object" || - // "\n" between non-CJ (not han or kana) chracters always can converted to Space + // "\n" between non-CJ (not han or kana) characters always can converted to Space (adjacentNodes.previous?.kind !== KIND_CJ_LETTER && adjacentNodes.next?.kind !== KIND_CJ_LETTER) || (!( From 1eafeead9346f84ed6fe807ebf3bc40166715036 Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Sun, 4 Sep 2022 18:36:30 +0900 Subject: [PATCH 12/87] Fix typo #2 (by @sosukesuzuki) Co-authored-by: Sosuke Suzuki --- src/language-markdown/printer-markdown.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/language-markdown/printer-markdown.js b/src/language-markdown/printer-markdown.js index 5e89cc86e5dc..e7357e824226 100644 --- a/src/language-markdown/printer-markdown.js +++ b/src/language-markdown/printer-markdown.js @@ -531,7 +531,7 @@ function getAncestorNode(path, typeOrTypes) { } /** - * Finds out if Space is tend to be inserted between Chinese or Japanese charaters + * Finds out if Space is tend to be inserted between Chinese or Japanese characters * (including ideograph aka han or kanji e.g. `字`, hiragana e.g. `あ`, and katakana e.g. `ア`) * and other letters (including alphanumerics; e.g. `A` or `1`) in the sentence. * From b994f3aee38726e3e79a847c99e94cf77dd0b647 Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Sun, 4 Sep 2022 21:54:26 +0900 Subject: [PATCH 13/87] Make sure not to insert Space around CJK punctuation --- src/language-markdown/printer-markdown.js | 83 +++++++++++++---------- 1 file changed, 49 insertions(+), 34 deletions(-) diff --git a/src/language-markdown/printer-markdown.js b/src/language-markdown/printer-markdown.js index e7357e824226..993efe768e3e 100644 --- a/src/language-markdown/printer-markdown.js +++ b/src/language-markdown/printer-markdown.js @@ -627,40 +627,55 @@ function isCorrespondingMarkFollowedBySpaceBefore(path, mark) { * @returns {boolean} `true` if given node can be converted to space, `false` if not (i.e. newline or empty character) */ function canBeConvertedToSpace(path, value, adjacentNodes) { - return ( - // "\n" or " ", of course " " always can be converted to Space - value !== "\n" || - // no adjacent nodes - typeof adjacentNodes !== "object" || - // "\n" between non-CJ (not han or kana) characters always can converted to Space - (adjacentNodes.previous?.kind !== KIND_CJ_LETTER && - adjacentNodes.next?.kind !== KIND_CJ_LETTER) || - (!( - // Do not convert it to Space when: - // "\n" between CJ always SHALL NOT be converted to space - ( - (adjacentNodes.previous?.kind === KIND_CJ_LETTER && - adjacentNodes.next?.kind === KIND_CJ_LETTER) || - // Shall not be converted to Space around CJK punctuation - adjacentNodes.previous?.kind === KIND_CJK_PUNCTUATION || - adjacentNodes.next?.kind === KIND_CJK_PUNCTUATION - ) - ) && - // The following rules do not precede the above rules. - // - // 1. "\n" between special signs and CJ characters - // [corresponding sign][Space][any string][CJ][[\n]][target sign] - // we wonder if there are other marks to be considered. - ((adjacentNodes.next.value === ":::" && - isCorrespondingMarkFollowedBySpaceBefore( - path, - adjacentNodes.next.value - )) || - // 2. If sentence uses space between CJ and alphanumerics (including hangul because of backward-compatibility), - // "\n" can be converted to Space. - // Note: Koran uses space to divide words, so it is difficult to determine if "\n" should be converted to Space. - isSentenceUseCJDividingSpace(path))) - ); + // "\n" or " ", of course " " always can be converted to Space + if (value !== "\n") { + return true; + } + // no adjacent nodes + if (typeof adjacentNodes !== "object") { + return true; + } + // "\n" between non-CJ (not han, kana, CJK punctuations) characters always can converted to Space + if ( + notCJLikeKind(adjacentNodes.previous?.kind) && + notCJLikeKind(adjacentNodes.next?.kind) + ) { + return true; + } + // Do not convert it to Space when: + if ( + // "\n" between CJ always SHALL NOT be converted to space + (adjacentNodes.previous?.kind === KIND_CJ_LETTER && + adjacentNodes.next?.kind === KIND_CJ_LETTER) || + // Shall not be converted to Space around CJK punctuation + adjacentNodes.previous?.kind === KIND_CJK_PUNCTUATION || + adjacentNodes.next?.kind === KIND_CJK_PUNCTUATION + ) { + return false; + } + // The following rules do not precede the above rules (`return false`). + // + // 1. "\n" between special signs and CJ characters + // [corresponding sign][Space][any string][CJ][[\n]][target sign] + // we wonder if there are other marks to be considered. + if ( + adjacentNodes.next.value === ":::" && + isCorrespondingMarkFollowedBySpaceBefore(path, adjacentNodes.next.value) + ) { + return true; + } + // 2. If sentence uses space between CJ and alphanumerics (including hangul because of backward-compatibility), + // "\n" can be converted to Space. + // Note: Koran uses space to divide words, so it is difficult to determine if "\n" should be converted to Space. + return isSentenceUseCJDividingSpace(path); +} + +/** + * @param {import("./utils.js").WordKind | undefined} kind + * @returns {boolean} `true` if `kind` is what is used in Chinese & Japanese (han, kana, and CJK punctuations) + */ +function notCJLikeKind(kind) { + return kind !== KIND_CJ_LETTER && kind !== KIND_CJK_PUNCTUATION; } /** From a94933e4060bd5c23f47834236602cb52778b6e0 Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Mon, 19 Sep 2022 21:49:48 +0900 Subject: [PATCH 14/87] Fix missing space in JSDoc type definition --- src/language-markdown/utils.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/language-markdown/utils.js b/src/language-markdown/utils.js index 96a8e903310a..3278da466fe8 100644 --- a/src/language-markdown/utils.js +++ b/src/language-markdown/utils.js @@ -43,7 +43,7 @@ const KIND_CJK_PUNCTUATION = "cjk-punctuation"; /** * @typedef {" " | "\n" | ""} WhiteSpaceValue - * @typedef { KIND_NON_CJK | KIND_CJ_LETTER | KIND_K_LETTER | KIND_CJK_PUNCTUATION} WordKind + * @typedef { KIND_NON_CJK | KIND_CJ_LETTER | KIND_K_LETTER | KIND_CJK_PUNCTUATION } WordKind * @typedef {{ * type: "whitespace", * value: WhiteSpaceValue, From 40b9023aa09525525eb4e1fc085bca226c8c1643 Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Mon, 19 Sep 2022 23:47:34 +0900 Subject: [PATCH 15/87] Improve handling newline around CJK characters - Line breaking rules in CJK - Do not insert newline in Korean words (#6516) - Inserting Space rule between CJK and symbols --- src/language-markdown/printer-markdown.js | 217 ++++++++++++++++------ 1 file changed, 162 insertions(+), 55 deletions(-) diff --git a/src/language-markdown/printer-markdown.js b/src/language-markdown/printer-markdown.js index 993efe768e3e..8944f252ce4e 100644 --- a/src/language-markdown/printer-markdown.js +++ b/src/language-markdown/printer-markdown.js @@ -38,6 +38,8 @@ import { isAutolink, KIND_CJK_PUNCTUATION, KIND_CJ_LETTER, + KIND_NON_CJK, + KIND_K_LETTER, } from "./utils.js"; import visitorKeys from "./visitor-keys.js"; @@ -54,6 +56,42 @@ const SIBLING_NODE_TYPES = new Set([ "definition", "footnoteDefinition", ]); +// https://en.wikipedia.org/wiki/Line_breaking_rules_in_East_Asian_languages +/** + * The set of characters that must not immediately precede a line break + * + * e.g. `"("` + * + * - Bad: `"檜原村(\nひのはらむら)"` + * - Good: `"檜原村\n(ひのはらむら)"` or ``"檜原村(ひ\nのはらむら)"` + */ +const noBreakAfterSymbolSet = new Set( + "$(£¥·'\"〈《「『【〔〖〝﹙﹛$(.[{£¥[{‵︴︵︷︹︻︽︿﹁﹃﹏〘⦅«" +); +/** + * The set of characters that must not be immediately followed by a line break + * + * e.g. `")"` + * + * - Bad: `"檜原村(ひのはらむら\n)以外には、"` + * - Good: `"檜原村(ひのはらむ\nら)以外には、"` or `"檜原村(ひのはらむら)\n以外には、"` + */ +const noBreakBeforeSymbolSet = new Set( + "!%),.:;?]}¢°·'\"†‡›℃∶、。〃〆〕〗〞﹚﹜!"%'),.:;?]}~–—•〉》」︰︱︲︳﹐﹑﹒﹓﹔﹕﹖﹘︶︸︺︼︾﹀﹂﹗|、』】〙〟⦆»ヽヾーァィゥェォッャュョヮヵヶぁぃぅぇぉっゃゅょゎゕゖㇰㇱㇲㇳㇴㇵㇶㇷㇸㇹㇺㇻㇼㇽㇾㇿ々〻‐゠〜‼⁇⁈⁉・" +); +/** + * The set of characters whose surrounding newline may be converted to Space + * + * - ASCII punctuation marks + */ +const lineBreakBetweenTheseAndCJKConvertToSpaceSymbolSet = new Set( + "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" +); + +/** + * Unicode punctuation character defined in [CommonMark Spec](https://spec.commonmark.org/0.30/#unicode-punctuation-character) + */ +const punctuationsRegex = /(\p{Pc}|\p{Pd}|\p{Pe}|\p{Pf}|\p{Pi}|\p{Po}|\p{Ps})/u; function genericPrint(path, options, print) { const { node } = path; @@ -138,10 +176,7 @@ function genericPrint(path, options, print) { return escapedValue; } case "whitespace": { - const parentNode = path.getParentNode(); - const index = parentNode.children.indexOf(node); - const previous = parentNode.children[index - 1]; - const { next } = path; + const { next, previous } = path; const proseWrap = // leading char that may cause different syntax @@ -576,40 +611,12 @@ function isSentenceUseCJDividingSpace(path) { return sentenceNode.isCJSpacingUsing; } -/** - * Looks for 1st `:::` (MUST be followed by a Space) from 2nd `:::`. - * - * ``` - * ::: foo - * bar - * ::: - * ``` - * - * @param {*} path current position in nodes tree - * @param {string} mark what to look for - * @returns `true` if a `mark` followed by Space (U+0020) is found before `path`, `false` otherwise. - */ -function isCorrespondingMarkFollowedBySpaceBefore(path, mark) { - const sentenceNode = getAncestorNode(path, "sentence"); - if ( - sentenceNode.children - .slice(0, getPenultimate(path.stack)) - .some( - (node, i, array) => - node.value === mark && - array[i + 1]?.type === "whitespace" && - array[i + 1]?.value === " " - ) - ) { - return true; - } - return false; -} - /** * @typedef {import("./utils.js").TextNode} TextNode * @typedef {import("./utils.js").WhiteSpaceValue} WhiteSpaceValue - * @typedef {{next?: TextNode | undefined, previous?: TextNode | undefined}} AdjacentNodes + * @typedef {{next?: TextNode | undefined | null, previous?: TextNode | undefined | null}} AdjacentNodes + * @typedef {import("./utils.js").WordKind} WordKind + * @typedef {import("../common/ast-path").default} AstPath */ /** @@ -635,35 +642,49 @@ function canBeConvertedToSpace(path, value, adjacentNodes) { if (typeof adjacentNodes !== "object") { return true; } - // "\n" between non-CJ (not han, kana, CJK punctuations) characters always can converted to Space - if ( - notCJLikeKind(adjacentNodes.previous?.kind) && - notCJLikeKind(adjacentNodes.next?.kind) - ) { + const previousKind = adjacentNodes.previous?.kind; + const nextKind = adjacentNodes.next?.kind; + // "\n" between not western or Korean (han, kana, CJK punctuations) characters always can converted to Space + // Korean hangul simulates latin words; see #6516 (https://github.com/prettier/prettier/issues/6516) + if (isWesternOrKorean(previousKind) && isWesternOrKorean(nextKind)) { return true; } // Do not convert it to Space when: if ( - // "\n" between CJ always SHALL NOT be converted to space - (adjacentNodes.previous?.kind === KIND_CJ_LETTER && - adjacentNodes.next?.kind === KIND_CJ_LETTER) || // Shall not be converted to Space around CJK punctuation - adjacentNodes.previous?.kind === KIND_CJK_PUNCTUATION || - adjacentNodes.next?.kind === KIND_CJK_PUNCTUATION + previousKind === KIND_CJK_PUNCTUATION || + nextKind === KIND_CJK_PUNCTUATION || + // "\n" between CJ always SHALL NOT be converted to space + // "\n" between Korean and CJ is better not to be converted to space + (isCJK(previousKind) && isCJK(nextKind)) ) { return false; } // The following rules do not precede the above rules (`return false`). // + // Cases: + // - CJ & non-CJK + // // 1. "\n" between special signs and CJ characters // [corresponding sign][Space][any string][CJ][[\n]][target sign] // we wonder if there are other marks to be considered. if ( - adjacentNodes.next.value === ":::" && - isCorrespondingMarkFollowedBySpaceBefore(path, adjacentNodes.next.value) + lineBreakBetweenTheseAndCJKConvertToSpaceSymbolSet.has( + adjacentNodes.next?.value + ) || + lineBreakBetweenTheseAndCJKConvertToSpaceSymbolSet.has( + adjacentNodes.previous?.value + ) ) { return true; } + // Converting newline between CJ and non-ASCII punctuation to Space does not seem to be better in many cases. (PR welcome) + if ( + punctuationsRegex.test(adjacentNodes.previous?.value) || + punctuationsRegex.test(adjacentNodes.next?.value) + ) { + return false; + } // 2. If sentence uses space between CJ and alphanumerics (including hangul because of backward-compatibility), // "\n" can be converted to Space. // Note: Koran uses space to divide words, so it is difficult to determine if "\n" should be converted to Space. @@ -671,11 +692,99 @@ function canBeConvertedToSpace(path, value, adjacentNodes) { } /** - * @param {import("./utils.js").WordKind | undefined} kind - * @returns {boolean} `true` if `kind` is what is used in Chinese & Japanese (han, kana, and CJK punctuations) + * @param {WordKind | undefined} kind + * @returns {boolean} `true` if `kind` is CJK (including punctuation marks) + */ +function isCJK(kind) { + return kind !== undefined && kind !== KIND_NON_CJK; +} + +/** + * @param {WordKind | undefined} kind + * @returns {boolean} `true` if `kind` is western or Korean letters (divids words by Space) + */ +function isWesternOrKorean(kind) { + return kind !== undefined && kind !== KIND_NON_CJK; +} + +/** + * Get the last character + * + * Type for arrays is not provided because I don't know how it is compatible with string (PR welcome) + * + * @param {string} str + * @returns {string} last character */ -function notCJLikeKind(kind) { - return kind !== KIND_CJ_LETTER && kind !== KIND_CJK_PUNCTUATION; +function getLastCharacter(str) { + return str[str.length - 1]; +} + +/** + * Returns whether “whitespace” (`"" | " " | "\n"`; see `WhiteSpaceValue`) can converted to `"\n"` + * + * @param {*} path + * @param {WhiteSpaceValue} value + * @param {*} options + * @param {AdjacentNodes | undefined} [adjacentNodes] + * @returns {boolean} `true` if “whitespace” can be converted to `"\n"` + */ +function isBreakable(path, value, options, adjacentNodes) { + if (options.proseWrap !== "always") { + return false; + } + if (getAncestorNode(path, SINGLE_LINE_NODE_TYPES)) { + return false; + } + if (adjacentNodes == undefined) { + return true; + } + // Simulates latin words; see #6516 (https://github.com/prettier/prettier/issues/6516) + if ( + value === "" && + adjacentNodes.previous?.kind === KIND_K_LETTER && + adjacentNodes.next?.kind === KIND_K_LETTER + ) { + return false; + } + if (value === " ") { + return true; + } + // https://en.wikipedia.org/wiki/Line_breaking_rules_in_East_Asian_languages + const isBreakingCJKLineBreakingRule = + (adjacentNodes.next?.value !== undefined && + noBreakBeforeSymbolSet.has(adjacentNodes.next.value[0])) || + (adjacentNodes.previous?.value !== undefined && + noBreakAfterSymbolSet.has( + getLastCharacter(adjacentNodes.previous.value) + )); + // For "" (CJK and some non-space) higher priority than the follwing rule + if (value === "" && isBreakingCJKLineBreakingRule) { + return false; + } + // Unusual newline between CJ and ASCII punctuation marks + // e.g. + // 中文日本語 + // ... + // Foobar + // ... + // + // common usage of `...`: + // 正在加载... 読込中... + if ( + value === "\n" && + ((adjacentNodes.previous?.kind === KIND_CJ_LETTER && + lineBreakBetweenTheseAndCJKConvertToSpaceSymbolSet.has( + adjacentNodes.next?.value[0] + )) || + (adjacentNodes.next?.kind === KIND_CJ_LETTER && + lineBreakBetweenTheseAndCJKConvertToSpaceSymbolSet.has( + adjacentNodes.previous?.value + ))) + ) { + return true; + } + // For "\n" lower priority (because the rule has already been broken) + return !isBreakingCJKLineBreakingRule; } /** @@ -689,9 +798,7 @@ function printLine(path, value, options, adjacentNodes) { return hardline; } - const isBreakable = - options.proseWrap === "always" && - !getAncestorNode(path, SINGLE_LINE_NODE_TYPES); + const isBreakable_ = isBreakable(path, value, options, adjacentNodes); // Space or empty if (value !== "\n") { @@ -707,7 +814,7 @@ function printLine(path, value, options, adjacentNodes) { * @param value {" " | ""} */ function convertToLineIfBreakable(value) { - if (!isBreakable) { + if (!isBreakable_) { return value; } return value === " " ? line : softline; From c6417faaef9ed201cab73b56ca59069d6017efed Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Fri, 23 Sep 2022 17:23:45 +0900 Subject: [PATCH 16/87] Remove `@ts-expect-error` --- src/language-markdown/utils.js | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/language-markdown/utils.js b/src/language-markdown/utils.js index 3278da466fe8..0c466960a88f 100644 --- a/src/language-markdown/utils.js +++ b/src/language-markdown/utils.js @@ -153,9 +153,7 @@ function splitText(text, options) { function isBetween(kind1, kind2) { return ( - // @ts-expect-error (lastNode.kind === kind1 && node.kind === kind2) || - // @ts-expect-error (lastNode.kind === kind2 && node.kind === kind1) ); } From bfcd89001b5166a1b817926ff530a019962de166 Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Fri, 23 Sep 2022 17:34:13 +0900 Subject: [PATCH 17/87] Treat Space and newline as always breakable --- src/language-markdown/printer-markdown.js | 35 ++++------------------- 1 file changed, 6 insertions(+), 29 deletions(-) diff --git a/src/language-markdown/printer-markdown.js b/src/language-markdown/printer-markdown.js index 8944f252ce4e..837a5b27b65e 100644 --- a/src/language-markdown/printer-markdown.js +++ b/src/language-markdown/printer-markdown.js @@ -738,17 +738,17 @@ function isBreakable(path, value, options, adjacentNodes) { if (adjacentNodes == undefined) { return true; } + // Space & newline are always breakable + if (value !== "") { + return true; + } // Simulates latin words; see #6516 (https://github.com/prettier/prettier/issues/6516) if ( - value === "" && adjacentNodes.previous?.kind === KIND_K_LETTER && adjacentNodes.next?.kind === KIND_K_LETTER ) { return false; } - if (value === " ") { - return true; - } // https://en.wikipedia.org/wiki/Line_breaking_rules_in_East_Asian_languages const isBreakingCJKLineBreakingRule = (adjacentNodes.next?.value !== undefined && @@ -758,33 +758,10 @@ function isBreakable(path, value, options, adjacentNodes) { getLastCharacter(adjacentNodes.previous.value) )); // For "" (CJK and some non-space) higher priority than the follwing rule - if (value === "" && isBreakingCJKLineBreakingRule) { + if (isBreakingCJKLineBreakingRule) { return false; } - // Unusual newline between CJ and ASCII punctuation marks - // e.g. - // 中文日本語 - // ... - // Foobar - // ... - // - // common usage of `...`: - // 正在加载... 読込中... - if ( - value === "\n" && - ((adjacentNodes.previous?.kind === KIND_CJ_LETTER && - lineBreakBetweenTheseAndCJKConvertToSpaceSymbolSet.has( - adjacentNodes.next?.value[0] - )) || - (adjacentNodes.next?.kind === KIND_CJ_LETTER && - lineBreakBetweenTheseAndCJKConvertToSpaceSymbolSet.has( - adjacentNodes.previous?.value - ))) - ) { - return true; - } - // For "\n" lower priority (because the rule has already been broken) - return !isBreakingCJKLineBreakingRule; + return true; } /** From 21a8dc7ec4726e111e03abaad7d98f5113352229 Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Fri, 23 Sep 2022 17:38:53 +0900 Subject: [PATCH 18/87] Fix `isWesternOrKorean(Letter)` --- src/language-markdown/printer-markdown.js | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/language-markdown/printer-markdown.js b/src/language-markdown/printer-markdown.js index 837a5b27b65e..4b296fe5a61f 100644 --- a/src/language-markdown/printer-markdown.js +++ b/src/language-markdown/printer-markdown.js @@ -646,7 +646,10 @@ function canBeConvertedToSpace(path, value, adjacentNodes) { const nextKind = adjacentNodes.next?.kind; // "\n" between not western or Korean (han, kana, CJK punctuations) characters always can converted to Space // Korean hangul simulates latin words; see #6516 (https://github.com/prettier/prettier/issues/6516) - if (isWesternOrKorean(previousKind) && isWesternOrKorean(nextKind)) { + if ( + isWesternOrKoreanLetter(previousKind) && + isWesternOrKoreanLetter(nextKind) + ) { return true; } // Do not convert it to Space when: @@ -703,8 +706,8 @@ function isCJK(kind) { * @param {WordKind | undefined} kind * @returns {boolean} `true` if `kind` is western or Korean letters (divids words by Space) */ -function isWesternOrKorean(kind) { - return kind !== undefined && kind !== KIND_NON_CJK; +function isWesternOrKoreanLetter(kind) { + return kind === KIND_NON_CJK || kind === KIND_K_LETTER; } /** From 7ae66730554a47bc495a9d001725d9e594bfff10 Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Sat, 24 Sep 2022 00:15:20 +0900 Subject: [PATCH 19/87] Convert `\n` to Space if either of adjacent nodes are undefined --- src/language-markdown/printer-markdown.js | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/language-markdown/printer-markdown.js b/src/language-markdown/printer-markdown.js index 4b296fe5a61f..dc84fed381eb 100644 --- a/src/language-markdown/printer-markdown.js +++ b/src/language-markdown/printer-markdown.js @@ -642,6 +642,13 @@ function canBeConvertedToSpace(path, value, adjacentNodes) { if (typeof adjacentNodes !== "object") { return true; } + // e.g. " \nletter" + if ( + typeof adjacentNodes.next === "undefined" || + typeof adjacentNodes.previous === "undefined" + ) { + return true; + } const previousKind = adjacentNodes.previous?.kind; const nextKind = adjacentNodes.next?.kind; // "\n" between not western or Korean (han, kana, CJK punctuations) characters always can converted to Space From 4f6eca4c81ff61275e44c3d198817b05834efaaa Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Sat, 24 Sep 2022 00:36:41 +0900 Subject: [PATCH 20/87] Use `String.prototype.at` --- src/language-markdown/printer-markdown.js | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/src/language-markdown/printer-markdown.js b/src/language-markdown/printer-markdown.js index dc84fed381eb..2a22729587e0 100644 --- a/src/language-markdown/printer-markdown.js +++ b/src/language-markdown/printer-markdown.js @@ -717,18 +717,6 @@ function isWesternOrKoreanLetter(kind) { return kind === KIND_NON_CJK || kind === KIND_K_LETTER; } -/** - * Get the last character - * - * Type for arrays is not provided because I don't know how it is compatible with string (PR welcome) - * - * @param {string} str - * @returns {string} last character - */ -function getLastCharacter(str) { - return str[str.length - 1]; -} - /** * Returns whether “whitespace” (`"" | " " | "\n"`; see `WhiteSpaceValue`) can converted to `"\n"` * @@ -745,7 +733,7 @@ function isBreakable(path, value, options, adjacentNodes) { if (getAncestorNode(path, SINGLE_LINE_NODE_TYPES)) { return false; } - if (adjacentNodes == undefined) { + if (adjacentNodes === undefined) { return true; } // Space & newline are always breakable @@ -764,9 +752,7 @@ function isBreakable(path, value, options, adjacentNodes) { (adjacentNodes.next?.value !== undefined && noBreakBeforeSymbolSet.has(adjacentNodes.next.value[0])) || (adjacentNodes.previous?.value !== undefined && - noBreakAfterSymbolSet.has( - getLastCharacter(adjacentNodes.previous.value) - )); + noBreakAfterSymbolSet.has(adjacentNodes.previous.value.at(-1))); // For "" (CJK and some non-space) higher priority than the follwing rule if (isBreakingCJKLineBreakingRule) { return false; From 60e1eac6aecd1985599ee9a931d5629bb32011c9 Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Sat, 24 Sep 2022 00:42:53 +0900 Subject: [PATCH 21/87] Use negative operator to check undefined or null --- src/language-markdown/printer-markdown.js | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/language-markdown/printer-markdown.js b/src/language-markdown/printer-markdown.js index 2a22729587e0..71c54a537e80 100644 --- a/src/language-markdown/printer-markdown.js +++ b/src/language-markdown/printer-markdown.js @@ -639,14 +639,13 @@ function canBeConvertedToSpace(path, value, adjacentNodes) { return true; } // no adjacent nodes - if (typeof adjacentNodes !== "object") { + // Use `!` (no zero, NaN, or empty string) + if (!adjacentNodes) { return true; } // e.g. " \nletter" - if ( - typeof adjacentNodes.next === "undefined" || - typeof adjacentNodes.previous === "undefined" - ) { + // Use `!` (no zero, NaN, or empty string) + if (!adjacentNodes.previous || !adjacentNodes.next) { return true; } const previousKind = adjacentNodes.previous?.kind; From 613761369bb9ff9e2742503dff8525cfcae9abdc Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Sat, 24 Sep 2022 00:43:54 +0900 Subject: [PATCH 22/87] Don't capture group in regex --- src/language-markdown/printer-markdown.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/language-markdown/printer-markdown.js b/src/language-markdown/printer-markdown.js index 71c54a537e80..937f2c9663dc 100644 --- a/src/language-markdown/printer-markdown.js +++ b/src/language-markdown/printer-markdown.js @@ -91,7 +91,7 @@ const lineBreakBetweenTheseAndCJKConvertToSpaceSymbolSet = new Set( /** * Unicode punctuation character defined in [CommonMark Spec](https://spec.commonmark.org/0.30/#unicode-punctuation-character) */ -const punctuationsRegex = /(\p{Pc}|\p{Pd}|\p{Pe}|\p{Pf}|\p{Pi}|\p{Po}|\p{Ps})/u; +const punctuationsRegex = /(?:\p{Pc}|\p{Pd}|\p{Pe}|\p{Pf}|\p{Pi}|\p{Po}|\p{Ps})/u; function genericPrint(path, options, print) { const { node } = path; From 3d470001190a6f1930038f00ee6061eed13da9a2 Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Sat, 24 Sep 2022 00:44:55 +0900 Subject: [PATCH 23/87] Remove unused import --- src/language-markdown/printer-markdown.js | 1 - 1 file changed, 1 deletion(-) diff --git a/src/language-markdown/printer-markdown.js b/src/language-markdown/printer-markdown.js index 937f2c9663dc..dad82006667f 100644 --- a/src/language-markdown/printer-markdown.js +++ b/src/language-markdown/printer-markdown.js @@ -37,7 +37,6 @@ import { INLINE_NODE_WRAPPER_TYPES, isAutolink, KIND_CJK_PUNCTUATION, - KIND_CJ_LETTER, KIND_NON_CJK, KIND_K_LETTER, } from "./utils.js"; From cb5ac2d8741e9f02cc9c75f6c5f1a1444a0e3ab5 Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Sat, 24 Sep 2022 00:55:32 +0900 Subject: [PATCH 24/87] Add missing extraction of first & last characters of adjacent nodes --- src/language-markdown/printer-markdown.js | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/src/language-markdown/printer-markdown.js b/src/language-markdown/printer-markdown.js index dad82006667f..ad83a851ecd2 100644 --- a/src/language-markdown/printer-markdown.js +++ b/src/language-markdown/printer-markdown.js @@ -90,7 +90,8 @@ const lineBreakBetweenTheseAndCJKConvertToSpaceSymbolSet = new Set( /** * Unicode punctuation character defined in [CommonMark Spec](https://spec.commonmark.org/0.30/#unicode-punctuation-character) */ -const punctuationsRegex = /(?:\p{Pc}|\p{Pd}|\p{Pe}|\p{Pf}|\p{Pi}|\p{Po}|\p{Ps})/u; +const punctuationsRegex = + /(?:\p{Pc}|\p{Pd}|\p{Pe}|\p{Pf}|\p{Pi}|\p{Po}|\p{Ps})/u; function genericPrint(path, options, print) { const { node } = path; @@ -668,6 +669,8 @@ function canBeConvertedToSpace(path, value, adjacentNodes) { ) { return false; } + const previousLastChar = adjacentNodes.previous?.value?.at(-1); + const nextFirstChar = adjacentNodes.next?.value?.at(0); // The following rules do not precede the above rules (`return false`). // // Cases: @@ -677,19 +680,15 @@ function canBeConvertedToSpace(path, value, adjacentNodes) { // [corresponding sign][Space][any string][CJ][[\n]][target sign] // we wonder if there are other marks to be considered. if ( - lineBreakBetweenTheseAndCJKConvertToSpaceSymbolSet.has( - adjacentNodes.next?.value - ) || - lineBreakBetweenTheseAndCJKConvertToSpaceSymbolSet.has( - adjacentNodes.previous?.value - ) + lineBreakBetweenTheseAndCJKConvertToSpaceSymbolSet.has(nextFirstChar) || + lineBreakBetweenTheseAndCJKConvertToSpaceSymbolSet.has(previousLastChar) ) { return true; } // Converting newline between CJ and non-ASCII punctuation to Space does not seem to be better in many cases. (PR welcome) if ( - punctuationsRegex.test(adjacentNodes.previous?.value) || - punctuationsRegex.test(adjacentNodes.next?.value) + punctuationsRegex.test(previousLastChar) || + punctuationsRegex.test(nextFirstChar) ) { return false; } From 4c0f9b9fc97fc29066430a8977f68465c798c5ad Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Sat, 24 Sep 2022 01:06:47 +0900 Subject: [PATCH 25/87] Fix typo --- src/language-markdown/printer-markdown.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/language-markdown/printer-markdown.js b/src/language-markdown/printer-markdown.js index ad83a851ecd2..f971b2cc4ca9 100644 --- a/src/language-markdown/printer-markdown.js +++ b/src/language-markdown/printer-markdown.js @@ -708,7 +708,7 @@ function isCJK(kind) { /** * @param {WordKind | undefined} kind - * @returns {boolean} `true` if `kind` is western or Korean letters (divids words by Space) + * @returns {boolean} `true` if `kind` is western or Korean letters (divides words by Space) */ function isWesternOrKoreanLetter(kind) { return kind === KIND_NON_CJK || kind === KIND_K_LETTER; @@ -750,7 +750,7 @@ function isBreakable(path, value, options, adjacentNodes) { noBreakBeforeSymbolSet.has(adjacentNodes.next.value[0])) || (adjacentNodes.previous?.value !== undefined && noBreakAfterSymbolSet.has(adjacentNodes.previous.value.at(-1))); - // For "" (CJK and some non-space) higher priority than the follwing rule + // For "" (CJK and some non-space) higher priority than the following rule if (isBreakingCJKLineBreakingRule) { return false; } From 8039594cff735cefddc82b37cfda17f3cc28cd65 Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Sat, 24 Sep 2022 10:11:17 +0900 Subject: [PATCH 26/87] Update snapshot --- .../__snapshots__/jsfmt.spec.js.snap | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/tests/format/markdown/splitCjkText/__snapshots__/jsfmt.spec.js.snap b/tests/format/markdown/splitCjkText/__snapshots__/jsfmt.spec.js.snap index 82f3467d9cb6..3184adea7657 100644 --- a/tests/format/markdown/splitCjkText/__snapshots__/jsfmt.spec.js.snap +++ b/tests/format/markdown/splitCjkText/__snapshots__/jsfmt.spec.js.snap @@ -64,15 +64,16 @@ English混合著中文的一段Paragraph!這是一個English混合著中文的 一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph! 這是一個English混合著中文的一段Paragraph! -Englishと日本語が混ざったParagraphだよ!Englishと日本語が混ざったParagraphだよ -!Englishと日本語が混ざったParagraphだよ!Englishと日本語が混ざったParagraphだよ -!Englishと日本語が混ざったParagraphだよ!Englishと日本語が混ざったParagraphだよ -!Englishと日本語が混ざったParagraphだよ! - -MarkdownフォーマッタPrettierサイコー!MarkdownフォーマッタPrettierサイコー -!MarkdownフォーマッタPrettierサイコー!MarkdownフォーマッタPrettierサイコー -!MarkdownフォーマッタPrettierサイコー!MarkdownフォーマッタPrettierサイコー -!MarkdownフォーマッタPrettierサイコー!MarkdownフォーマッタPrettierサイコー! +Englishと日本語が混ざったParagraphだよ!Englishと日本語が混ざったParagraphだ +よ!Englishと日本語が混ざったParagraphだよ!Englishと日本語が混ざったParagraphだ +よ!Englishと日本語が混ざったParagraphだよ!Englishと日本語が混ざったParagraphだ +よ!Englishと日本語が混ざったParagraphだよ! + +MarkdownフォーマッタPrettierサイコー!MarkdownフォーマッタPrettierサイ +コー!MarkdownフォーマッタPrettierサイコー!MarkdownフォーマッタPrettierサイ +コー!MarkdownフォーマッタPrettierサイコー!MarkdownフォーマッタPrettierサイ +コー!MarkdownフォーマッタPrettierサイコー!MarkdownフォーマッタPrettierサイ +コー! Prettierの最初のCommitは2016年11月29日に行われました。Prettier的第一次Commit是在 2016年11月29日。 From b38c3b96293d85bad77b3fd747e9b28cef7f7993 Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Sat, 24 Sep 2022 10:29:39 +0900 Subject: [PATCH 27/87] Fix handling undefined --- src/language-markdown/printer-markdown.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/language-markdown/printer-markdown.js b/src/language-markdown/printer-markdown.js index f971b2cc4ca9..ba9b935924c9 100644 --- a/src/language-markdown/printer-markdown.js +++ b/src/language-markdown/printer-markdown.js @@ -747,9 +747,9 @@ function isBreakable(path, value, options, adjacentNodes) { // https://en.wikipedia.org/wiki/Line_breaking_rules_in_East_Asian_languages const isBreakingCJKLineBreakingRule = (adjacentNodes.next?.value !== undefined && - noBreakBeforeSymbolSet.has(adjacentNodes.next.value[0])) || + noBreakBeforeSymbolSet.has(adjacentNodes.next?.value?.at(0))) || (adjacentNodes.previous?.value !== undefined && - noBreakAfterSymbolSet.has(adjacentNodes.previous.value.at(-1))); + noBreakAfterSymbolSet.has(adjacentNodes.previous?.value?.at(-1))); // For "" (CJK and some non-space) higher priority than the following rule if (isBreakingCJKLineBreakingRule) { return false; From 7b8023436d8ebf6308164bcc3faa83d2e6915ef7 Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Wed, 28 Sep 2022 23:20:44 +0900 Subject: [PATCH 28/87] Update Chinese & Japanese Markdown testcases --- .../__snapshots__/jsfmt.spec.js.snap | 125 ++++++++++++++++-- .../markdown/splitCjkText/han-kana-alnum.md | 6 +- .../splitCjkText/symbolSpaceNewLine.md | 50 +++++++ 3 files changed, 164 insertions(+), 17 deletions(-) create mode 100644 tests/format/markdown/splitCjkText/symbolSpaceNewLine.md diff --git a/tests/format/markdown/splitCjkText/__snapshots__/jsfmt.spec.js.snap b/tests/format/markdown/splitCjkText/__snapshots__/jsfmt.spec.js.snap index 3184adea7657..8465546c49e6 100644 --- a/tests/format/markdown/splitCjkText/__snapshots__/jsfmt.spec.js.snap +++ b/tests/format/markdown/splitCjkText/__snapshots__/jsfmt.spec.js.snap @@ -47,11 +47,11 @@ proseWrap: "always" =====================================input====================================== 這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph! -Englishと日本語が混ざったParagraphだよ!Englishと日本語が混ざったParagraphだよ!Englishと日本語が混ざったParagraphだよ!Englishと日本語が混ざったParagraphだよ!Englishと日本語が混ざったParagraphだよ!Englishと日本語が混ざったParagraphだよ!Englishと日本語が混ざったParagraphだよ! +Englishと日本語が混ざったParagraphが現れた!Prettierはどうする?Englishと日本語が混ざったParagraphが現れた!Prettierはどうする?Englishと日本語が混ざったParagraphが現れた!Prettierはどうする?Englishと日本語が混ざったParagraphが現れた!Prettierはどうする?Englishと日本語が混ざったParagraphが現れた!Prettierはどうする?Englishと日本語が混ざったParagraphが現れた!Prettierはどうする?Englishと日本語が混ざったParagraphが現れた!Prettierはどうする?Englishと日本語が混ざったParagraphが現れた!Prettierはどうする?Englishと日本語が混ざったParagraphが現れた!Prettierはどうする?Englishと日本語が混ざったParagraphが現れた!Prettierはどうする?Englishと日本語が混ざったParagraphが現れた!Prettierはどうする?Englishと日本語が混ざったParagraphが現れた!Prettierはどうする?Englishと日本語が混ざったParagraphが現れた!Prettierはどうする?Englishと日本語が混ざったParagraphが現れた!Prettierはどうする?Englishと日本語が混ざったParagraphが現れた!Prettierはどうする? -MarkdownフォーマッタPrettierサイコー!MarkdownフォーマッタPrettierサイコー!MarkdownフォーマッタPrettierサイコー!MarkdownフォーマッタPrettierサイコー!MarkdownフォーマッタPrettierサイコー!MarkdownフォーマッタPrettierサイコー!MarkdownフォーマッタPrettierサイコー!MarkdownフォーマッタPrettierサイコー! +MarkdownフォーマッターPrettierはサイコー!MarkdownフォーマッターPrettierはサイコー!MarkdownフォーマッターPrettierはサイコー!MarkdownフォーマッターPrettierはサイコー!MarkdownフォーマッターPrettierはサイコー!MarkdownフォーマッターPrettierはサイコー!MarkdownフォーマッターPrettierはサイコー!MarkdownフォーマッターPrettierはサイコー!MarkdownフォーマッターPrettierはサイコー!MarkdownフォーマッターPrettierはサイコー!MarkdownフォーマッターPrettierはサイコー!MarkdownフォーマッターPrettierはサイコー!MarkdownフォーマッターPrettierはサイコー!MarkdownフォーマッターPrettierはサイコー! -Prettierの最初のCommitは2016年11月29日に行われました。Prettier的第一次Commit是在2016年11月29日。 +Prettierの最初のCommitは2016年11月29日に行われました。Prettier的第一次Commit是在2016年11月29日。Prettierの最初のCommitは2016年11月29日に行われました。Prettier的第一次Commit是在2016年11月29日。Prettierの最初のCommitは2016年11月29日に行われました。Prettier的第一次Commit是在2016年11月29日。Prettierの最初のCommitは2016年11月29日に行われました。Prettier的第一次Commit是在2016年11月29日。Prettierの最初のCommitは2016年11月29日に行われました。Prettier的第一次Commit是在2016年11月29日。Prettierの最初のCommitは2016年11月29日に行われました。Prettier的第一次Commit是在2016年11月29日。Prettierの最初のCommitは2016年11月29日に行われました。Prettier的第一次Commit是在2016年11月29日。Prettierの最初のCommitは2016年11月29日に行われました。Prettier的第一次Commit是在2016年11月29日。Prettierの最初のCommitは2016年11月29日に行われました。Prettier的第一次Commit是在2016年11月29日。 =====================================output===================================== 這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段 @@ -64,19 +64,40 @@ English混合著中文的一段Paragraph!這是一個English混合著中文的 一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph! 這是一個English混合著中文的一段Paragraph! -Englishと日本語が混ざったParagraphだよ!Englishと日本語が混ざったParagraphだ -よ!Englishと日本語が混ざったParagraphだよ!Englishと日本語が混ざったParagraphだ -よ!Englishと日本語が混ざったParagraphだよ!Englishと日本語が混ざったParagraphだ -よ!Englishと日本語が混ざったParagraphだよ! - -MarkdownフォーマッタPrettierサイコー!MarkdownフォーマッタPrettierサイ -コー!MarkdownフォーマッタPrettierサイコー!MarkdownフォーマッタPrettierサイ -コー!MarkdownフォーマッタPrettierサイコー!MarkdownフォーマッタPrettierサイ -コー!MarkdownフォーマッタPrettierサイコー!MarkdownフォーマッタPrettierサイ -コー! +Englishと日本語が混ざったParagraphが現れた!Prettierはどうする?Englishと日本語 +が混ざったParagraphが現れた!Prettierはどうする?Englishと日本語が混ざった +Paragraphが現れた!Prettierはどうする?Englishと日本語が混ざったParagraphが現れ +た!Prettierはどうする?Englishと日本語が混ざったParagraphが現れた!Prettierはど +うする?Englishと日本語が混ざったParagraphが現れた!Prettierはどうする?English +と日本語が混ざったParagraphが現れた!Prettierはどうする?Englishと日本語が混ざっ +たParagraphが現れた!Prettierはどうする?Englishと日本語が混ざったParagraphが現 +れた!Prettierはどうする?Englishと日本語が混ざったParagraphが現れた!Prettierは +どうする?Englishと日本語が混ざったParagraphが現れた!Prettierはどうす +る?Englishと日本語が混ざったParagraphが現れた!Prettierはどうする?Englishと日 +本語が混ざったParagraphが現れた!Prettierはどうする?Englishと日本語が混ざった +Paragraphが現れた!Prettierはどうする?Englishと日本語が混ざったParagraphが現れ +た!Prettierはどうする? + +MarkdownフォーマッターPrettierはサイコー!MarkdownフォーマッターPrettierはサイ +コー!MarkdownフォーマッターPrettierはサイコー!MarkdownフォーマッターPrettierは +サイコー!MarkdownフォーマッターPrettierはサイコー!Markdownフォーマッター +Prettierはサイコー!MarkdownフォーマッターPrettierはサイコー!Markdownフォーマッ +ターPrettierはサイコー!MarkdownフォーマッターPrettierはサイコー!Markdownフォー +マッターPrettierはサイコー!MarkdownフォーマッターPrettierはサイコー!Markdown +フォーマッターPrettierはサイコー!MarkdownフォーマッターPrettierはサイ +コー!MarkdownフォーマッターPrettierはサイコー! Prettierの最初のCommitは2016年11月29日に行われました。Prettier的第一次Commit是在 -2016年11月29日。 +2016年11月29日。Prettierの最初のCommitは2016年11月29日に行われました。Prettier的 +第一次Commit是在2016年11月29日。Prettierの最初のCommitは2016年11月29日に行われま +した。Prettier的第一次Commit是在2016年11月29日。Prettierの最初のCommitは2016年11 +月29日に行われました。Prettier的第一次Commit是在2016年11月29日。Prettierの最初の +Commitは2016年11月29日に行われました。Prettier的第一次Commit是在2016年11月29 +日。Prettierの最初のCommitは2016年11月29日に行われました。Prettier的第一次Commit +是在2016年11月29日。Prettierの最初のCommitは2016年11月29日に行われまし +た。Prettier的第一次Commit是在2016年11月29日。Prettierの最初のCommitは2016年11月 +29日に行われました。Prettier的第一次Commit是在2016年11月29日。Prettierの最初の +Commitは2016年11月29日に行われました。Prettier的第一次Commit是在2016年11月29日。 ================================================================================ `; @@ -157,3 +178,79 @@ English 混合著中文的一段 Paragraph! ================================================================================ `; + +exports[`symbolSpaceNewLine.md - {"proseWrap":"always"} format 1`] = ` +====================================options===================================== +parsers: ["markdown"] +printWidth: 80 +proseWrap: "always" + | printWidth +=====================================input====================================== +日本語 +、 +にほんご +。 +汉语, +中文. +日 +本 +語 +, +に +ほ +ん +ご +. +English +words!? +漢字 +! +汉字 +? +「セリフ」 +(括弧) +文字 +(括弧) +文字 +【括弧】 +日本語 +English + +「禁則(きんそく)処理(しょり)!」 +「禁則(きんそく)処理(しょり)!」 +「禁則(きんそく)処理(しょり)!」 +「禁則(きんそく)処理(しょり)」 +「禁則(きんそく)処理(しょり)!」 +「禁則(きんそく)処理(しょり)!」 +「禁則(きんそく)処理(しょり)」 +「禁則(きんそく)処理(しょり)」 +「禁則(きんそく)処理(しょり)!!!!」 +「禁則(きんそく)処理(しょり)!!!」 +「禁則(きんそく)処理(しょり)!」 +「禁則(きんそく)処理(しょり)!」 +「禁則(きんそく)処理(しょり)!」 +「禁則(きんそく)処理(しょり)!」 +「禁則(きんそく)処理(しょり)!」 +「禁則(きんそく)処理(しょり)!」 + +中点 +・ +中点 + +=====================================output===================================== +日本語、にほんご。汉语, 中文. 日本語,にほんご.English words!? 漢字!汉字?「セ +リフ」(括弧) 文字(括弧)文字【括弧】日本語English + +「禁則(きんそく)処理(しょり)!」「禁則(きんそく)処理(しょり)!」「禁則 +(きんそく)処理(しょり)!」「禁則(きんそく)処理(しょり)」「禁則(きんそ +く)処理(しょり)!」「禁則(きんそく)処理(しょり)!」「禁則(きんそく)処理 +(しょり)」「禁則(きんそく)処理(しょり)」「禁則(きんそく)処理(しょ +り)!!!!」「禁則(きんそく)処理(しょり)!!!」「禁則(きんそく)処理 +(しょり)!」「禁則(きんそく)処理(しょり)!」「禁則(きんそく)処理(しょ +り)!」「禁則(きんそく)処理(しょり)!」「禁則(きんそく)処理(しょり)!」 +「禁則(きんそく)処理(しょり)!」 + +中点・中点 + +================================================================================ +`; diff --git a/tests/format/markdown/splitCjkText/han-kana-alnum.md b/tests/format/markdown/splitCjkText/han-kana-alnum.md index b7a8e4fd7af0..55e90bc7237e 100644 --- a/tests/format/markdown/splitCjkText/han-kana-alnum.md +++ b/tests/format/markdown/splitCjkText/han-kana-alnum.md @@ -1,7 +1,7 @@ 這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph!這是一個English混合著中文的一段Paragraph! -Englishと日本語が混ざったParagraphだよ!Englishと日本語が混ざったParagraphだよ!Englishと日本語が混ざったParagraphだよ!Englishと日本語が混ざったParagraphだよ!Englishと日本語が混ざったParagraphだよ!Englishと日本語が混ざったParagraphだよ!Englishと日本語が混ざったParagraphだよ! +Englishと日本語が混ざったParagraphが現れた!Prettierはどうする?Englishと日本語が混ざったParagraphが現れた!Prettierはどうする?Englishと日本語が混ざったParagraphが現れた!Prettierはどうする?Englishと日本語が混ざったParagraphが現れた!Prettierはどうする?Englishと日本語が混ざったParagraphが現れた!Prettierはどうする?Englishと日本語が混ざったParagraphが現れた!Prettierはどうする?Englishと日本語が混ざったParagraphが現れた!Prettierはどうする?Englishと日本語が混ざったParagraphが現れた!Prettierはどうする?Englishと日本語が混ざったParagraphが現れた!Prettierはどうする?Englishと日本語が混ざったParagraphが現れた!Prettierはどうする?Englishと日本語が混ざったParagraphが現れた!Prettierはどうする?Englishと日本語が混ざったParagraphが現れた!Prettierはどうする?Englishと日本語が混ざったParagraphが現れた!Prettierはどうする?Englishと日本語が混ざったParagraphが現れた!Prettierはどうする?Englishと日本語が混ざったParagraphが現れた!Prettierはどうする? -MarkdownフォーマッタPrettierサイコー!MarkdownフォーマッタPrettierサイコー!MarkdownフォーマッタPrettierサイコー!MarkdownフォーマッタPrettierサイコー!MarkdownフォーマッタPrettierサイコー!MarkdownフォーマッタPrettierサイコー!MarkdownフォーマッタPrettierサイコー!MarkdownフォーマッタPrettierサイコー! +MarkdownフォーマッターPrettierはサイコー!MarkdownフォーマッターPrettierはサイコー!MarkdownフォーマッターPrettierはサイコー!MarkdownフォーマッターPrettierはサイコー!MarkdownフォーマッターPrettierはサイコー!MarkdownフォーマッターPrettierはサイコー!MarkdownフォーマッターPrettierはサイコー!MarkdownフォーマッターPrettierはサイコー!MarkdownフォーマッターPrettierはサイコー!MarkdownフォーマッターPrettierはサイコー!MarkdownフォーマッターPrettierはサイコー!MarkdownフォーマッターPrettierはサイコー!MarkdownフォーマッターPrettierはサイコー!MarkdownフォーマッターPrettierはサイコー! -Prettierの最初のCommitは2016年11月29日に行われました。Prettier的第一次Commit是在2016年11月29日。 +Prettierの最初のCommitは2016年11月29日に行われました。Prettier的第一次Commit是在2016年11月29日。Prettierの最初のCommitは2016年11月29日に行われました。Prettier的第一次Commit是在2016年11月29日。Prettierの最初のCommitは2016年11月29日に行われました。Prettier的第一次Commit是在2016年11月29日。Prettierの最初のCommitは2016年11月29日に行われました。Prettier的第一次Commit是在2016年11月29日。Prettierの最初のCommitは2016年11月29日に行われました。Prettier的第一次Commit是在2016年11月29日。Prettierの最初のCommitは2016年11月29日に行われました。Prettier的第一次Commit是在2016年11月29日。Prettierの最初のCommitは2016年11月29日に行われました。Prettier的第一次Commit是在2016年11月29日。Prettierの最初のCommitは2016年11月29日に行われました。Prettier的第一次Commit是在2016年11月29日。Prettierの最初のCommitは2016年11月29日に行われました。Prettier的第一次Commit是在2016年11月29日。 diff --git a/tests/format/markdown/splitCjkText/symbolSpaceNewLine.md b/tests/format/markdown/splitCjkText/symbolSpaceNewLine.md new file mode 100644 index 000000000000..f6e277e7cf82 --- /dev/null +++ b/tests/format/markdown/splitCjkText/symbolSpaceNewLine.md @@ -0,0 +1,50 @@ +日本語 +、 +にほんご +。 +汉语, +中文. +日 +本 +語 +, +に +ほ +ん +ご +. +English +words!? +漢字 +! +汉字 +? +「セリフ」 +(括弧) +文字 +(括弧) +文字 +【括弧】 +日本語 +English + +「禁則(きんそく)処理(しょり)!」 +「禁則(きんそく)処理(しょり)!」 +「禁則(きんそく)処理(しょり)!」 +「禁則(きんそく)処理(しょり)」 +「禁則(きんそく)処理(しょり)!」 +「禁則(きんそく)処理(しょり)!」 +「禁則(きんそく)処理(しょり)」 +「禁則(きんそく)処理(しょり)」 +「禁則(きんそく)処理(しょり)!!!!」 +「禁則(きんそく)処理(しょり)!!!」 +「禁則(きんそく)処理(しょり)!」 +「禁則(きんそく)処理(しょり)!」 +「禁則(きんそく)処理(しょり)!」 +「禁則(きんそく)処理(しょり)!」 +「禁則(きんそく)処理(しょり)!」 +「禁則(きんそく)処理(しょり)!」 + +中点 +・ +中点 From 3a0f03b0bc020b702858b50b2215fc83da89b43b Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Wed, 28 Sep 2022 23:34:14 +0900 Subject: [PATCH 29/87] Fix AstPath import & type hinting in JSDoc --- src/language-markdown/printer-markdown.js | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/language-markdown/printer-markdown.js b/src/language-markdown/printer-markdown.js index ba9b935924c9..282ed9520075 100644 --- a/src/language-markdown/printer-markdown.js +++ b/src/language-markdown/printer-markdown.js @@ -616,7 +616,7 @@ function isSentenceUseCJDividingSpace(path) { * @typedef {import("./utils.js").WhiteSpaceValue} WhiteSpaceValue * @typedef {{next?: TextNode | undefined | null, previous?: TextNode | undefined | null}} AdjacentNodes * @typedef {import("./utils.js").WordKind} WordKind - * @typedef {import("../common/ast-path").default} AstPath + * @typedef {import("../common/ast-path.js").default} AstPath */ /** @@ -628,7 +628,7 @@ function isSentenceUseCJDividingSpace(path) { * However, you should note that Chinese and Japanese does not use U+0020 Space to divide words, so U+00A0 No-break space must not be replaced with it. * Behavior in other languages (e.g. Thai) will not be changed because there are too much things to consider. (PR welcome) * - * @param {*} path path of given node + * @param {AstPath} path path of given node * @param {WhiteSpaceValue} value value of given node (typically `" "` or `"\n"`) * @param {AdjacentNodes | undefined} adjacentNodes adjacent sibling nodes of given node * @returns {boolean} `true` if given node can be converted to space, `false` if not (i.e. newline or empty character) @@ -717,7 +717,7 @@ function isWesternOrKoreanLetter(kind) { /** * Returns whether “whitespace” (`"" | " " | "\n"`; see `WhiteSpaceValue`) can converted to `"\n"` * - * @param {*} path + * @param {AstPath} path * @param {WhiteSpaceValue} value * @param {*} options * @param {AdjacentNodes | undefined} [adjacentNodes] @@ -758,7 +758,7 @@ function isBreakable(path, value, options, adjacentNodes) { } /** - * @param {*} path + * @param {AstPath} path * @param {WhiteSpaceValue} value * @param {*} options * @param {AdjacentNodes | undefined} [adjacentNodes] From 97a14cc893b81f283cdd225f5359fa1cf0f7f561 Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Fri, 30 Sep 2022 23:10:16 +0900 Subject: [PATCH 30/87] Improve no break symbol set --- src/language-markdown/printer-markdown.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/language-markdown/printer-markdown.js b/src/language-markdown/printer-markdown.js index 282ed9520075..1394300bf3aa 100644 --- a/src/language-markdown/printer-markdown.js +++ b/src/language-markdown/printer-markdown.js @@ -65,7 +65,7 @@ const SIBLING_NODE_TYPES = new Set([ * - Good: `"檜原村\n(ひのはらむら)"` or ``"檜原村(ひ\nのはらむら)"` */ const noBreakAfterSymbolSet = new Set( - "$(£¥·'\"〈《「『【〔〖〝﹙﹛$(.[{£¥[{‵︴︵︷︹︻︽︿﹁﹃﹏〘⦅«" + "$(£¥·'\"〈《「『【〔〖〝﹙﹛$([{£¥[{‵︴︵︷︹︻︽︿﹁﹃﹏〘⦅«" ); /** * The set of characters that must not be immediately followed by a line break @@ -76,7 +76,7 @@ const noBreakAfterSymbolSet = new Set( * - Good: `"檜原村(ひのはらむ\nら)以外には、"` or `"檜原村(ひのはらむら)\n以外には、"` */ const noBreakBeforeSymbolSet = new Set( - "!%),.:;?]}¢°·'\"†‡›℃∶、。〃〆〕〗〞﹚﹜!"%'),.:;?]}~–—•〉》」︰︱︲︳﹐﹑﹒﹓﹔﹕﹖﹘︶︸︺︼︾﹀﹂﹗|、』】〙〟⦆»ヽヾーァィゥェォッャュョヮヵヶぁぃぅぇぉっゃゅょゎゕゖㇰㇱㇲㇳㇴㇵㇶㇷㇸㇹㇺㇻㇼㇽㇾㇿ々〻‐゠〜‼⁇⁈⁉・" + "!%),.:;?]}¢°·'\"†‡›℃∶、。〃〆〕〗〞﹚﹜!"%'),.:;?]}~–—•〉》」︰︱︲︳﹐﹑﹒﹓﹔﹕﹖﹘︶︸︺︼︾﹀﹂﹗|、』】〙〟⦆»ヽヾーァィゥェォッャュョヮヵヶぁぃぅぇぉっゃゅょゎゕゖㇰㇱㇲㇳㇴㇵㇶㇷㇸㇹㇺㇻㇼㇽㇾㇿ々〻‐゠〜~‼⁇⁈⁉・" ); /** * The set of characters whose surrounding newline may be converted to Space From 228fa1bc3c4d6b3759abe4485f6315e8639135e2 Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Sat, 1 Oct 2022 14:23:54 +0900 Subject: [PATCH 31/87] Fix `unicorn/no-lonely-if` --- src/language-markdown/utils.js | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/language-markdown/utils.js b/src/language-markdown/utils.js index 0c466960a88f..52e914fdf808 100644 --- a/src/language-markdown/utils.js +++ b/src/language-markdown/utils.js @@ -140,14 +140,14 @@ function splitText(text, options) { function appendNode(node) { const lastNode = nodes.at(-1); - if (lastNode && lastNode.type === "word") { - if ( - !isBetween(KIND_NON_CJK, KIND_CJK_PUNCTUATION) && - // disallow leading/trailing full-width whitespace - ![lastNode.value, node.value].some((value) => /\u3000/.test(value)) - ) { - nodes.push({ type: "whitespace", value: "" }); - } + if ( + lastNode && + lastNode.type === "word" && + !isBetween(KIND_NON_CJK, KIND_CJK_PUNCTUATION) && + // disallow leading/trailing full-width whitespace + ![lastNode.value, node.value].some((value) => /\u3000/.test(value)) + ) { + nodes.push({ type: "whitespace", value: "" }); } nodes.push(node); From 5a94e3623d75d690e1fcfe26e54ee2cde2aaa3c8 Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Sat, 1 Oct 2022 16:38:32 +0900 Subject: [PATCH 32/87] Convert newline surrounded by Korean to Space --- src/language-markdown/printer-markdown.js | 7 ++- src/language-markdown/utils.js | 12 ++--- .../__snapshots__/jsfmt.spec.js.snap | 54 +++++++++++++++++++ tests/format/markdown/splitCjkText/korean.md | 37 +++++++++++++ 4 files changed, 99 insertions(+), 11 deletions(-) diff --git a/src/language-markdown/printer-markdown.js b/src/language-markdown/printer-markdown.js index 1394300bf3aa..924cfb0aae5a 100644 --- a/src/language-markdown/printer-markdown.js +++ b/src/language-markdown/printer-markdown.js @@ -738,9 +738,12 @@ function isBreakable(path, value, options, adjacentNodes) { return true; } // Simulates latin words; see #6516 (https://github.com/prettier/prettier/issues/6516) + // [Latin][""][Hangul] & vice versa => Don't break if ( - adjacentNodes.previous?.kind === KIND_K_LETTER && - adjacentNodes.next?.kind === KIND_K_LETTER + (adjacentNodes.previous?.kind === KIND_K_LETTER && + isWesternOrKoreanLetter(adjacentNodes.next?.kind)) || + (adjacentNodes.next?.kind === KIND_K_LETTER && + isWesternOrKoreanLetter(adjacentNodes.previous?.kind)) ) { return false; } diff --git a/src/language-markdown/utils.js b/src/language-markdown/utils.js index 52e914fdf808..0358ff6e75b9 100644 --- a/src/language-markdown/utils.js +++ b/src/language-markdown/utils.js @@ -63,18 +63,12 @@ const KIND_CJK_PUNCTUATION = "cjk-punctuation"; * split text into whitespaces and words * @param {string} text */ -function splitText(text, options) { +// eslint-disable-next-line no-unused-vars +function splitText(text, _options) { /** @type {Array} */ const nodes = []; - const tokens = ( - options.proseWrap === "preserve" - ? text - : text.replace( - new RegExp(`(${cjkPattern})\n(${cjkPattern})`, "g"), - "$1$2" - ) - ).split(/([\t\n ]+)/); + const tokens = text.split(/([\t\n ]+)/); for (const [index, token] of tokens.entries()) { // whitespace if (index % 2 === 1) { diff --git a/tests/format/markdown/splitCjkText/__snapshots__/jsfmt.spec.js.snap b/tests/format/markdown/splitCjkText/__snapshots__/jsfmt.spec.js.snap index 8465546c49e6..6cf5f15c0f3f 100644 --- a/tests/format/markdown/splitCjkText/__snapshots__/jsfmt.spec.js.snap +++ b/tests/format/markdown/splitCjkText/__snapshots__/jsfmt.spec.js.snap @@ -111,9 +111,63 @@ proseWrap: "always" =====================================input====================================== 예문Latin예문Latin 예문Latin예문 Latin예문Latin 예문 Latin 예문 +한국어와 English를 섞어 보았습니다. 한국어와 English를 섞어 보았습니다. +한국어와 +English를 +섞어 +보았습니다. +한국어와 +English를 +섞어 +보았습니다. +한국어와 English를 섞어 보았습니다. 한국어와 English를 섞어 보았습니다. +한국어와 +English를 +섞어 +보았습니다. +한국어와 +English를 +섞어 +보았습니다. +한국어와 English를 섞어 보았습니다. 한국어와 English를 섞어 보았습니다. + +NVIDIA의 RTX3000 시리즈는 삼성전자와 TSMC에 의해 제조된다. +NVIDIA의 RTX3000 시리즈는 삼성전자와 TSMC에 의해 제조된다. +NVIDIA의 +RTX3000 +시리즈는 +삼성전자와 +TSMC에 +의해 +제조된다. +NVIDIA의 RTX3000 시리즈는 삼성전자와 TSMC에 의해 제조된다. +NVIDIA의 RTX3000 시리즈는 삼성전자와 TSMC에 의해 제조된다. + +대한민국(듣기 (도움말·정보), 大韓民國, 영어: Republic of Korea; ROK[3])은 동아시아의 한반도 중남부에 있는 공화국이다. + +서쪽으로는 서해를 사이에 두고 중화인민공화국이, 동쪽으로는 동해를 사이에 두고 일본이 있으며 +북쪽으로는 조선민주주의인민공화국과 맞닿아 있다. + =====================================output===================================== 예문Latin예문Latin 예문Latin예문 Latin예문Latin 예문 Latin 예문 +한국어와 English를 섞어 보았습니다. 한국어와 English를 섞어 보았습니다. 한국어와 +English를 섞어 보았습니다. 한국어와 English를 섞어 보았습니다. 한국어와 +English를 섞어 보았습니다. 한국어와 English를 섞어 보았습니다. 한국어와 +English를 섞어 보았습니다. 한국어와 English를 섞어 보았습니다. 한국어와 +English를 섞어 보았습니다. 한국어와 English를 섞어 보았습니다. + +NVIDIA의 RTX3000 시리즈는 삼성전자와 TSMC에 의해 제조된다. NVIDIA의 RTX3000 +시리즈는 삼성전자와 TSMC에 의해 제조된다. NVIDIA의 RTX3000 시리즈는 삼성전자와 +TSMC에 의해 제조된다. NVIDIA의 RTX3000 시리즈는 삼성전자와 TSMC에 의해 제조된다. +NVIDIA의 RTX3000 시리즈는 삼성전자와 TSMC에 의해 제조된다. + +대한민국(듣기 (도움말·정보), 大韓民國, 영어: Republic of Korea; ROK[3])은 +동아시아의 한반도 중남부에 있는 공화국이다. + +서쪽으로는 서해를 사이에 두고 중화인민공화국이, 동쪽으로는 동해를 사이에 두고 +일본이 있으며 북쪽으로는 조선민주주의인민공화국과 맞닿아 있다. + ================================================================================ `; diff --git a/tests/format/markdown/splitCjkText/korean.md b/tests/format/markdown/splitCjkText/korean.md index 2bd2c9797af0..196318a90736 100644 --- a/tests/format/markdown/splitCjkText/korean.md +++ b/tests/format/markdown/splitCjkText/korean.md @@ -1 +1,38 @@ 예문Latin예문Latin 예문Latin예문 Latin예문Latin 예문 Latin 예문 + +한국어와 English를 섞어 보았습니다. 한국어와 English를 섞어 보았습니다. +한국어와 +English를 +섞어 +보았습니다. +한국어와 +English를 +섞어 +보았습니다. +한국어와 English를 섞어 보았습니다. 한국어와 English를 섞어 보았습니다. +한국어와 +English를 +섞어 +보았습니다. +한국어와 +English를 +섞어 +보았습니다. +한국어와 English를 섞어 보았습니다. 한국어와 English를 섞어 보았습니다. + +NVIDIA의 RTX3000 시리즈는 삼성전자와 TSMC에 의해 제조된다. +NVIDIA의 RTX3000 시리즈는 삼성전자와 TSMC에 의해 제조된다. +NVIDIA의 +RTX3000 +시리즈는 +삼성전자와 +TSMC에 +의해 +제조된다. +NVIDIA의 RTX3000 시리즈는 삼성전자와 TSMC에 의해 제조된다. +NVIDIA의 RTX3000 시리즈는 삼성전자와 TSMC에 의해 제조된다. + +대한민국(듣기 (도움말·정보), 大韓民國, 영어: Republic of Korea; ROK[3])은 동아시아의 한반도 중남부에 있는 공화국이다. + +서쪽으로는 서해를 사이에 두고 중화인민공화국이, 동쪽으로는 동해를 사이에 두고 일본이 있으며 +북쪽으로는 조선민주주의인민공화국과 맞닿아 있다. From a0e28fbc4adda68a33648d46dc9a095a2a31b5da Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Sat, 1 Oct 2022 23:13:59 +0900 Subject: [PATCH 33/87] Improve whitespace between Korean & Chinese --- src/language-markdown/printer-markdown.js | 21 ++++++++++++-- .../__snapshots__/jsfmt.spec.js.snap | 28 +++++++++++++++++++ tests/format/markdown/splitCjkText/korean.md | 22 +++++++++++++++ 3 files changed, 69 insertions(+), 2 deletions(-) diff --git a/src/language-markdown/printer-markdown.js b/src/language-markdown/printer-markdown.js index 924cfb0aae5a..7f041afc2ae2 100644 --- a/src/language-markdown/printer-markdown.js +++ b/src/language-markdown/printer-markdown.js @@ -39,6 +39,7 @@ import { KIND_CJK_PUNCTUATION, KIND_NON_CJK, KIND_K_LETTER, + KIND_CJ_LETTER, } from "./utils.js"; import visitorKeys from "./visitor-keys.js"; @@ -658,6 +659,13 @@ function canBeConvertedToSpace(path, value, adjacentNodes) { ) { return true; } + // han & hangul: same way preferred + if ( + (previousKind === KIND_K_LETTER && nextKind === KIND_CJ_LETTER) || + (nextKind === KIND_K_LETTER && previousKind === KIND_CJ_LETTER) + ) { + return true; + } // Do not convert it to Space when: if ( // Shall not be converted to Space around CJK punctuation @@ -706,6 +714,14 @@ function isCJK(kind) { return kind !== undefined && kind !== KIND_NON_CJK; } +/** + * @param {WordKind | undefined} kind + * @returns {boolean} `true` if `kind` is letter (not CJK punctuation) + */ +function isLetter(kind) { + return kind !== undefined && kind !== KIND_CJK_PUNCTUATION; +} + /** * @param {WordKind | undefined} kind * @returns {boolean} `true` if `kind` is western or Korean letters (divides words by Space) @@ -739,11 +755,12 @@ function isBreakable(path, value, options, adjacentNodes) { } // Simulates latin words; see #6516 (https://github.com/prettier/prettier/issues/6516) // [Latin][""][Hangul] & vice versa => Don't break + // [han & kana][""][hangul], either if ( (adjacentNodes.previous?.kind === KIND_K_LETTER && - isWesternOrKoreanLetter(adjacentNodes.next?.kind)) || + isLetter(adjacentNodes.next?.kind)) || (adjacentNodes.next?.kind === KIND_K_LETTER && - isWesternOrKoreanLetter(adjacentNodes.previous?.kind)) + isLetter(adjacentNodes.previous?.kind)) ) { return false; } diff --git a/tests/format/markdown/splitCjkText/__snapshots__/jsfmt.spec.js.snap b/tests/format/markdown/splitCjkText/__snapshots__/jsfmt.spec.js.snap index 6cf5f15c0f3f..e3a21ef5b902 100644 --- a/tests/format/markdown/splitCjkText/__snapshots__/jsfmt.spec.js.snap +++ b/tests/format/markdown/splitCjkText/__snapshots__/jsfmt.spec.js.snap @@ -148,6 +148,28 @@ NVIDIA의 RTX3000 시리즈는 삼성전자와 TSMC에 의해 제조된다. 서쪽으로는 서해를 사이에 두고 중화인민공화국이, 동쪽으로는 동해를 사이에 두고 일본이 있으며 북쪽으로는 조선민주주의인민공화국과 맞닿아 있다. +역대 대통령은 李承晩 과 許政 과 尹潽善 과 朴正熙 과 崔圭夏 과 朴忠勳 과 全斗煥 과 盧泰愚 과 金泳三 과 金大中 과 盧武鉉 과 李明博 과 朴槿惠 과 文在寅 과 尹錫悅 과 음... +역대 대통령은 +李承晩 +과 +許政 +과 +尹潽善 +과 +朴正熙 과 +崔圭夏 과 +朴忠勳 과 +全斗煥 과 +盧泰愚 과 +金泳三 +과 金大中 +과 盧武鉉 +과 李明博 +과 朴槿惠 +과 文在寅 +과 尹錫悅 과 +음... + =====================================output===================================== 예문Latin예문Latin 예문Latin예문 Latin예문Latin 예문 Latin 예문 @@ -168,6 +190,12 @@ NVIDIA의 RTX3000 시리즈는 삼성전자와 TSMC에 의해 제조된다. 서쪽으로는 서해를 사이에 두고 중화인민공화국이, 동쪽으로는 동해를 사이에 두고 일본이 있으며 북쪽으로는 조선민주주의인민공화국과 맞닿아 있다. +역대 대통령은 李承晩 과 許政 과 尹潽善 과 朴正熙 과 崔圭夏 과 朴忠勳 과 全斗煥 +과 盧泰愚 과 金泳三 과 金大中 과 盧武鉉 과 李明博 과 朴槿惠 과 文在寅 과 尹錫悅 +과 음... 역대 대통령은 李承晩 과 許政 과 尹潽善 과 朴正熙 과 崔圭夏 과 朴忠勳 과 +全斗煥 과 盧泰愚 과 金泳三 과 金大中 과 盧武鉉 과 李明博 과 朴槿惠 과 文在寅 과 +尹錫悅 과 음... + ================================================================================ `; diff --git a/tests/format/markdown/splitCjkText/korean.md b/tests/format/markdown/splitCjkText/korean.md index 196318a90736..4713c34243f7 100644 --- a/tests/format/markdown/splitCjkText/korean.md +++ b/tests/format/markdown/splitCjkText/korean.md @@ -36,3 +36,25 @@ NVIDIA의 RTX3000 시리즈는 삼성전자와 TSMC에 의해 제조된다. 서쪽으로는 서해를 사이에 두고 중화인민공화국이, 동쪽으로는 동해를 사이에 두고 일본이 있으며 북쪽으로는 조선민주주의인민공화국과 맞닿아 있다. + +역대 대통령은 李承晩 과 許政 과 尹潽善 과 朴正熙 과 崔圭夏 과 朴忠勳 과 全斗煥 과 盧泰愚 과 金泳三 과 金大中 과 盧武鉉 과 李明博 과 朴槿惠 과 文在寅 과 尹錫悅 과 음... +역대 대통령은 +李承晩 +과 +許政 +과 +尹潽善 +과 +朴正熙 과 +崔圭夏 과 +朴忠勳 과 +全斗煥 과 +盧泰愚 과 +金泳三 +과 金大中 +과 盧武鉉 +과 李明博 +과 朴槿惠 +과 文在寅 +과 尹錫悅 과 +음... From a005b32d4370680d0e4a2b3a458ae4e0718ae83e Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Sun, 2 Oct 2022 23:37:16 +0900 Subject: [PATCH 34/87] Update patch note --- changelog_unreleased/markdown/11597.md | 81 +++++++++++++++++++++++++- 1 file changed, 80 insertions(+), 1 deletion(-) diff --git a/changelog_unreleased/markdown/11597.md b/changelog_unreleased/markdown/11597.md index 78684efc97ea..5b9bb6acfbf4 100644 --- a/changelog_unreleased/markdown/11597.md +++ b/changelog_unreleased/markdown/11597.md @@ -1,4 +1,4 @@ -#### No more inserting space between Chinese or Japanese (e.g. hanzi and kana) and western characters (#11597 by @tats-u) +#### [BREAKING] No more inserting space between Chinese or Japanese (e.g. hanzi and kana) and western characters (#11597 by @tats-u) The current behavior of inserting whitespace (U+0020) between Chinese or Japanese (e.g. hanzi/kanji and kana) and western (e.g. alphanumerics) characters is not based on the official layout guidelines in Japanese and Chinese but [non-standard and local one in Chinese](https://github.com/ruanyf/document-style-guide/blob/master/docs/text.md). @@ -74,3 +74,82 @@ For these reasons, we stopped inserting space. 漢字 Alphabets ひらがな 12345 カタカナ 67890 한글 ``` + +This change includes the following breaking changes: + +##### 1. Comply to line breaking rules in Chinese and Japanese + +In Chinese and Japanese, some characters (e.g. full stop characters `。`, `.`, and `.`) are not allowed to be placed the head or tail of lines. + +- Japanese: +- Chinese: + +👎Bad: + +```text +日本語組版には +、禁則処理( +きんそくしょり +)と呼ばれるル +ールがあります +。 +``` + +👍Good: + +```text +日本語組版に +は、禁則処理 +(きんそくしょ +り)と呼ばれる +ルールがありま +す。 +``` + +In the above sentence: + +- `、`, `)`, `ー`, `ょ`, and `。` must not be placed in the head of lines. +- `(` must not be placed in the tail of lines. + +##### 2. Do not break lines inside Korean words + +This fixes #6516. Korean uses space to divide words or clauses, and inappropriate words (or clauses) divisions may change the meanings of sentences. + +- `노래를 못해요.`: I'm not good at singing. +- `노래를 못 해요.`: I can't sing (for some reason). + +Applying this patch, when `proseWrap` is other than `preserve`, Korean sentences are now formatted like those of European languages; successive (i.e. not divided by space) Hangul characters are no longer divided by the newline (U+000A), which may be converted to the space (U+0020) when the content of the document is modified. + +`proseWrap` = `always` & `printWidth` = `9`: + +Input: + +```text +노래를 못해요. +``` + +Before this patch: + +```text +노래를 못 +해요. +``` + +👎Reformat “Before this patch” with `printWidth` = `80`: + +```text +노래를 못 해요. +``` + +After this patch: + +```text +노래를 +못해요. +``` + +👍Reformat “After this patch” with `printWidth` = `80`: + +```text +노래를 못해요. +``` From 19b5304fb4756fe33f4f4d9209e2cfc022b1c289 Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Fri, 7 Oct 2022 22:57:45 +0900 Subject: [PATCH 35/87] Improve patch note --- changelog_unreleased/markdown/11597.md | 59 +++----------------------- 1 file changed, 5 insertions(+), 54 deletions(-) diff --git a/changelog_unreleased/markdown/11597.md b/changelog_unreleased/markdown/11597.md index 5b9bb6acfbf4..282addbedf88 100644 --- a/changelog_unreleased/markdown/11597.md +++ b/changelog_unreleased/markdown/11597.md @@ -1,61 +1,10 @@ -#### [BREAKING] No more inserting space between Chinese or Japanese (e.g. hanzi and kana) and western characters (#11597 by @tats-u) +#### No more inserting space between Chinese or Japanese (e.g. hanzi and kana) and western characters (#11597 by @tats-u) The current behavior of inserting whitespace (U+0020) between Chinese or Japanese (e.g. hanzi/kanji and kana) and western (e.g. alphanumerics) characters is not based on the official layout guidelines in Japanese and Chinese but [non-standard and local one in Chinese](https://github.com/ruanyf/document-style-guide/blob/master/docs/text.md). -Official Japanese guideline (W3C): +It is not the only or the official rule, so Prettier now does not force it. If the document does not apply the rule, Prettier does follow the policy. -> 3.9.1 Differences in Positioning of Characters and Symbols -> -> The positioning of characters and symbols may vary depending on the following. -> -> d. Are characters and symbols appearing in sequence in solid setting, or will there be a fixed size space between them? For example, sequences of ideographic characters (cl-19) and hiragana (cl-15) are set solid, and for Western characters (cl-27) following hiragana (cl-15) there will be quarter em spacing. - - - -> “one quarter em” means one quarter of the full-width size. (JIS Z 8125) -> “one quarter em space” means amount of space that is one quarter size of em space. - - - - -Official Japanese guideline (JIS X 4051:2004): - -> 4.7 和欧文混植処理 -> -> a) 横書きでは,和文と欧文との間の空き量は,四分アキを原則とする。 -> -> 4.7 Mixed Japanese and Western Text Composition -> -> a) In horizontal writing, the space between Japanese and western text should be one quarter em, as a rule. -> -> PR Author's Note: Original text is written only in Japanese and translation is based on [DeepL](https://www.deepl.com/translator). - - (Japanese) - -Official Chinese guideline (W3C): - -> 3.2.2 Mixed Text Composition in Horizontal Writing Mode -> -> In principle, there is tracking or spacing between an adjacent Han character and a Western character of up to one quarter of a Han character width, except at the line start or end. -> -> NOTE: Another approach is to use a Western word space (U+0020 SPACE), in which case the width depends on the font in use. - - - -As mentioned above, whitespace (U+0020) is allowed to be substituted for one quarter em only in Chinese, although they have a similar appearance. Also, even in Chinese, the rule is not adopted even in the W3C guideline page but is mentioned as just one of the options. - -Some renderers (e.g. convert to PDF using Pandoc with the backend of LaTeX) can automatically insert genuine one quarter em. The width of whitespace is different from one quarter em, so inserting whitespace (U+0020) takes away the option to leave it to renderers to insert one quarter em. Adding space should be left to renderers and should not be done by Prettier, just a formatter. - - -Adding whitespace may interfere with searches for text containing both Chinese or Japanese and western characters. For example, you cannot find “第1章” (Chapter 1) in a Markdown document or its derivative just by searching by the string “第1章” but “第 1 章”. - -To make matters worst, once whitespace is inserted, it is difficult to remove it. The following sentence cannot be said to be wrong. - -> 作る means make in Japanese. - -The too simple rule of removing whitespace between Chinese or Japanese characters and alphanumerics removes that between “作る” and “means” unless you modify the sentence, that is, quote “作る”. It is so difficult to create a common rule that can safely remove whitespace from all documents and deserves to be included in Prettier. - -For these reasons, we stopped inserting space. +For more information of the reason, pleasa see , which this patch is based on. ```markdown @@ -111,6 +60,8 @@ In the above sentence: - `、`, `)`, `ー`, `ょ`, and `。` must not be placed in the head of lines. - `(` must not be placed in the tail of lines. +Prettier now follows the rule when `proseWrap` is other than `presreve`. + ##### 2. Do not break lines inside Korean words This fixes #6516. Korean uses space to divide words or clauses, and inappropriate words (or clauses) divisions may change the meanings of sentences. From 42579487c5666ec4ffc64db4357ccc690e849524 Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Fri, 7 Oct 2022 23:23:49 +0900 Subject: [PATCH 36/87] Fix comments and symbol names --- src/language-markdown/printer-markdown.js | 44 +++++++++++------------ src/language-markdown/utils.js | 4 +-- 2 files changed, 23 insertions(+), 25 deletions(-) diff --git a/src/language-markdown/printer-markdown.js b/src/language-markdown/printer-markdown.js index 7f041afc2ae2..6bc63b5da803 100644 --- a/src/language-markdown/printer-markdown.js +++ b/src/language-markdown/printer-markdown.js @@ -65,18 +65,18 @@ const SIBLING_NODE_TYPES = new Set([ * - Bad: `"檜原村(\nひのはらむら)"` * - Good: `"檜原村\n(ひのはらむら)"` or ``"檜原村(ひ\nのはらむら)"` */ -const noBreakAfterSymbolSet = new Set( +const noBreakAfterCharacterSet = new Set( "$(£¥·'\"〈《「『【〔〖〝﹙﹛$([{£¥[{‵︴︵︷︹︻︽︿﹁﹃﹏〘⦅«" ); /** - * The set of characters that must not be immediately followed by a line break + * The set of characters that must not immediately follow a line break * * e.g. `")"` * * - Bad: `"檜原村(ひのはらむら\n)以外には、"` * - Good: `"檜原村(ひのはらむ\nら)以外には、"` or `"檜原村(ひのはらむら)\n以外には、"` */ -const noBreakBeforeSymbolSet = new Set( +const noBreakBeforeCharacterSet = new Set( "!%),.:;?]}¢°·'\"†‡›℃∶、。〃〆〕〗〞﹚﹜!"%'),.:;?]}~–—•〉》」︰︱︲︳﹐﹑﹒﹓﹔﹕﹖﹘︶︸︺︼︾﹀﹂﹗|、』】〙〟⦆»ヽヾーァィゥェォッャュョヮヵヶぁぃぅぇぉっゃゅょゎゕゖㇰㇱㇲㇳㇴㇵㇶㇷㇸㇹㇺㇻㇼㇽㇾㇿ々〻‐゠〜~‼⁇⁈⁉・" ); /** @@ -91,7 +91,7 @@ const lineBreakBetweenTheseAndCJKConvertToSpaceSymbolSet = new Set( /** * Unicode punctuation character defined in [CommonMark Spec](https://spec.commonmark.org/0.30/#unicode-punctuation-character) */ -const punctuationsRegex = +const punctuationRegex = /(?:\p{Pc}|\p{Pd}|\p{Pe}|\p{Pf}|\p{Pi}|\p{Po}|\p{Ps})/u; function genericPrint(path, options, print) { @@ -574,10 +574,10 @@ function getAncestorNode(path, typeOrTypes) { * @param {*} path current position in nodes tree * @returns {boolean} `true` if Space is tend to be inserted between these types of letters, `false` otherwise. */ -function isSentenceUseCJDividingSpace(path) { +function isInSentenceWithCJSpaces(path) { const sentenceNode = getAncestorNode(path, "sentence"); - if (sentenceNode.isCJSpacingUsing !== undefined) { - return sentenceNode.isCJSpacingUsing; + if (sentenceNode.usesCJSpaces !== undefined) { + return sentenceNode.usesCJSpaces; } const cjNonCJKSpacingStatistics = { @@ -607,14 +607,14 @@ function isSentenceUseCJDividingSpace(path) { } } // Injects a property to cache the result. - sentenceNode.isCJSpacingUsing = + sentenceNode.usesCJSpaces = cjNonCJKSpacingStatistics[" "] > cjNonCJKSpacingStatistics[""]; - return sentenceNode.isCJSpacingUsing; + return sentenceNode.usesCJSpaces; } /** * @typedef {import("./utils.js").TextNode} TextNode - * @typedef {import("./utils.js").WhiteSpaceValue} WhiteSpaceValue + * @typedef {import("./utils.js").WhitespaceValue} WhitespaceValue * @typedef {{next?: TextNode | undefined | null, previous?: TextNode | undefined | null}} AdjacentNodes * @typedef {import("./utils.js").WordKind} WordKind * @typedef {import("../common/ast-path.js").default} AstPath @@ -626,11 +626,11 @@ function isSentenceUseCJDividingSpace(path) { * For example, if you would like to squash the multi-line string `"You might want\nto use Prettier."` into a single line, * you would replace `"\n"` with `" "`. (`"You might want to use Prettier."`) * - * However, you should note that Chinese and Japanese does not use U+0020 Space to divide words, so U+00A0 No-break space must not be replaced with it. + * However, you should note that Chinese and Japanese does not use U+0020 Space to divide words, so U+000A End of Line must not be replaced with it. * Behavior in other languages (e.g. Thai) will not be changed because there are too much things to consider. (PR welcome) * * @param {AstPath} path path of given node - * @param {WhiteSpaceValue} value value of given node (typically `" "` or `"\n"`) + * @param {WhitespaceValue} value value of given node (typically `" "` or `"\n"`) * @param {AdjacentNodes | undefined} adjacentNodes adjacent sibling nodes of given node * @returns {boolean} `true` if given node can be converted to space, `false` if not (i.e. newline or empty character) */ @@ -640,12 +640,10 @@ function canBeConvertedToSpace(path, value, adjacentNodes) { return true; } // no adjacent nodes - // Use `!` (no zero, NaN, or empty string) if (!adjacentNodes) { return true; } // e.g. " \nletter" - // Use `!` (no zero, NaN, or empty string) if (!adjacentNodes.previous || !adjacentNodes.next) { return true; } @@ -695,15 +693,15 @@ function canBeConvertedToSpace(path, value, adjacentNodes) { } // Converting newline between CJ and non-ASCII punctuation to Space does not seem to be better in many cases. (PR welcome) if ( - punctuationsRegex.test(previousLastChar) || - punctuationsRegex.test(nextFirstChar) + punctuationRegex.test(previousLastChar) || + punctuationRegex.test(nextFirstChar) ) { return false; } // 2. If sentence uses space between CJ and alphanumerics (including hangul because of backward-compatibility), // "\n" can be converted to Space. - // Note: Koran uses space to divide words, so it is difficult to determine if "\n" should be converted to Space. - return isSentenceUseCJDividingSpace(path); + // Note: Korean uses space to divide words, so it is difficult to determine if "\n" should be converted to Space. + return isInSentenceWithCJSpaces(path); } /** @@ -731,10 +729,10 @@ function isWesternOrKoreanLetter(kind) { } /** - * Returns whether “whitespace” (`"" | " " | "\n"`; see `WhiteSpaceValue`) can converted to `"\n"` + * Returns whether “whitespace” (`"" | " " | "\n"`; see `WhitespaceValue`) can converted to `"\n"` * * @param {AstPath} path - * @param {WhiteSpaceValue} value + * @param {WhitespaceValue} value * @param {*} options * @param {AdjacentNodes | undefined} [adjacentNodes] * @returns {boolean} `true` if “whitespace” can be converted to `"\n"` @@ -767,9 +765,9 @@ function isBreakable(path, value, options, adjacentNodes) { // https://en.wikipedia.org/wiki/Line_breaking_rules_in_East_Asian_languages const isBreakingCJKLineBreakingRule = (adjacentNodes.next?.value !== undefined && - noBreakBeforeSymbolSet.has(adjacentNodes.next?.value?.at(0))) || + noBreakBeforeCharacterSet.has(adjacentNodes.next?.value?.at(0))) || (adjacentNodes.previous?.value !== undefined && - noBreakAfterSymbolSet.has(adjacentNodes.previous?.value?.at(-1))); + noBreakAfterCharacterSet.has(adjacentNodes.previous?.value?.at(-1))); // For "" (CJK and some non-space) higher priority than the following rule if (isBreakingCJKLineBreakingRule) { return false; @@ -779,7 +777,7 @@ function isBreakable(path, value, options, adjacentNodes) { /** * @param {AstPath} path - * @param {WhiteSpaceValue} value + * @param {WhitespaceValue} value * @param {*} options * @param {AdjacentNodes | undefined} [adjacentNodes] */ diff --git a/src/language-markdown/utils.js b/src/language-markdown/utils.js index 0358ff6e75b9..e2b265124046 100644 --- a/src/language-markdown/utils.js +++ b/src/language-markdown/utils.js @@ -42,11 +42,11 @@ const KIND_K_LETTER = "k-letter"; const KIND_CJK_PUNCTUATION = "cjk-punctuation"; /** - * @typedef {" " | "\n" | ""} WhiteSpaceValue + * @typedef {" " | "\n" | ""} WhitespaceValue * @typedef { KIND_NON_CJK | KIND_CJ_LETTER | KIND_K_LETTER | KIND_CJK_PUNCTUATION } WordKind * @typedef {{ * type: "whitespace", - * value: WhiteSpaceValue, + * value: WhitespaceValue, * kind?: undefined, * hasLeadingPunctuation?: undefined, * hasTrailingPunctuation?: undefined, From 16b60231ec222ef1e2f641e55074bb9b435be588 Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Fri, 7 Oct 2022 23:24:52 +0900 Subject: [PATCH 37/87] Fix typo --- changelog_unreleased/markdown/11597.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/changelog_unreleased/markdown/11597.md b/changelog_unreleased/markdown/11597.md index 282addbedf88..3fa8c1304d86 100644 --- a/changelog_unreleased/markdown/11597.md +++ b/changelog_unreleased/markdown/11597.md @@ -4,7 +4,7 @@ The current behavior of inserting whitespace (U+0020) between Chinese or Japanes It is not the only or the official rule, so Prettier now does not force it. If the document does not apply the rule, Prettier does follow the policy. -For more information of the reason, pleasa see , which this patch is based on. +For more information of the reason, please see , which this patch is based on. ```markdown @@ -60,7 +60,7 @@ In the above sentence: - `、`, `)`, `ー`, `ょ`, and `。` must not be placed in the head of lines. - `(` must not be placed in the tail of lines. -Prettier now follows the rule when `proseWrap` is other than `presreve`. +Prettier now follows the rule when `proseWrap` is other than `preserve`. ##### 2. Do not break lines inside Korean words From 2cba42fd1943d909099ac28d36b5427bcc962673 Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Sat, 8 Oct 2022 00:50:54 +0900 Subject: [PATCH 38/87] Remove unnecessary optional chaining --- src/language-markdown/printer-markdown.js | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/language-markdown/printer-markdown.js b/src/language-markdown/printer-markdown.js index 6bc63b5da803..35b80e6d46cf 100644 --- a/src/language-markdown/printer-markdown.js +++ b/src/language-markdown/printer-markdown.js @@ -647,8 +647,8 @@ function canBeConvertedToSpace(path, value, adjacentNodes) { if (!adjacentNodes.previous || !adjacentNodes.next) { return true; } - const previousKind = adjacentNodes.previous?.kind; - const nextKind = adjacentNodes.next?.kind; + const previousKind = adjacentNodes.previous.kind; + const nextKind = adjacentNodes.next.kind; // "\n" between not western or Korean (han, kana, CJK punctuations) characters always can converted to Space // Korean hangul simulates latin words; see #6516 (https://github.com/prettier/prettier/issues/6516) if ( @@ -675,8 +675,8 @@ function canBeConvertedToSpace(path, value, adjacentNodes) { ) { return false; } - const previousLastChar = adjacentNodes.previous?.value?.at(-1); - const nextFirstChar = adjacentNodes.next?.value?.at(0); + const previousLastChar = adjacentNodes.previous.value?.at(-1); + const nextFirstChar = adjacentNodes.next.value?.at(0); // The following rules do not precede the above rules (`return false`). // // Cases: From a4cd4f1d2c49b4fdc5335e440d1a3d13bff2e256 Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Sat, 8 Oct 2022 00:51:37 +0900 Subject: [PATCH 39/87] Refactor some functions --- src/language-markdown/printer-markdown.js | 38 +++++++++++------------ 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/src/language-markdown/printer-markdown.js b/src/language-markdown/printer-markdown.js index 35b80e6d46cf..60d8c4a605c0 100644 --- a/src/language-markdown/printer-markdown.js +++ b/src/language-markdown/printer-markdown.js @@ -586,23 +586,17 @@ function isInSentenceWithCJSpaces(path) { }; for (let i = 0; i < sentenceNode.children.length; ++i) { const node = sentenceNode.children[i]; - if (node.type === "whitespace") { - switch (node.value) { - case " ": - case "": { - const previous = sentenceNode.children[i - 1]; - const next = sentenceNode.children[i + 1]; - if ( - !( - (previous?.kind === "cj-letter" && next?.kind === "non-cjk") || - (previous?.kind === "non-cjk" && next?.kind === "cj-letter") - ) - ) { - continue; - } - ++cjNonCJKSpacingStatistics[node.value]; - continue; - } + if ( + node.type === "whitespace" && + (node.value === " " || node.value === "") + ) { + const previousKind = sentenceNode.children[i - 1]?.kind; + const nextKind = sentenceNode.children[i + 1]?.kind; + if ( + (previousKind === "cj-letter" && nextKind === "non-cjk") || + (previousKind === "non-cjk" && nextKind === "cj-letter") + ) { + ++cjNonCJKSpacingStatistics[node.value]; } } } @@ -709,7 +703,11 @@ function canBeConvertedToSpace(path, value, adjacentNodes) { * @returns {boolean} `true` if `kind` is CJK (including punctuation marks) */ function isCJK(kind) { - return kind !== undefined && kind !== KIND_NON_CJK; + return ( + kind === KIND_CJ_LETTER || + kind === KIND_K_LETTER || + kind === KIND_CJK_PUNCTUATION + ); } /** @@ -717,7 +715,9 @@ function isCJK(kind) { * @returns {boolean} `true` if `kind` is letter (not CJK punctuation) */ function isLetter(kind) { - return kind !== undefined && kind !== KIND_CJK_PUNCTUATION; + return ( + kind === KIND_NON_CJK || kind === KIND_CJ_LETTER || kind === KIND_K_LETTER + ); } /** From 3a94a5d70101273b88fd9c7d06dd5b7d53bb7d0d Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Sat, 8 Oct 2022 17:59:33 +0900 Subject: [PATCH 40/87] Use bracket for index 0 instead of at --- src/language-markdown/printer-markdown.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/language-markdown/printer-markdown.js b/src/language-markdown/printer-markdown.js index 60d8c4a605c0..0a4885bc7886 100644 --- a/src/language-markdown/printer-markdown.js +++ b/src/language-markdown/printer-markdown.js @@ -670,7 +670,7 @@ function canBeConvertedToSpace(path, value, adjacentNodes) { return false; } const previousLastChar = adjacentNodes.previous.value?.at(-1); - const nextFirstChar = adjacentNodes.next.value?.at(0); + const nextFirstChar = adjacentNodes.next.value?.[0]; // The following rules do not precede the above rules (`return false`). // // Cases: @@ -765,7 +765,7 @@ function isBreakable(path, value, options, adjacentNodes) { // https://en.wikipedia.org/wiki/Line_breaking_rules_in_East_Asian_languages const isBreakingCJKLineBreakingRule = (adjacentNodes.next?.value !== undefined && - noBreakBeforeCharacterSet.has(adjacentNodes.next?.value?.at(0))) || + noBreakBeforeCharacterSet.has(adjacentNodes.next?.value?.[0])) || (adjacentNodes.previous?.value !== undefined && noBreakAfterCharacterSet.has(adjacentNodes.previous?.value?.at(-1))); // For "" (CJK and some non-space) higher priority than the following rule From c8b620d57e389a548b85c3a2508b7cd4e99ecad3 Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Sun, 9 Oct 2022 00:53:03 +0900 Subject: [PATCH 41/87] Divert `punctuationRegex` in `utils.js` --- src/language-markdown/printer-markdown.js | 7 +------ src/language-markdown/utils.js | 1 + 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/src/language-markdown/printer-markdown.js b/src/language-markdown/printer-markdown.js index 0a4885bc7886..55a12d5f5283 100644 --- a/src/language-markdown/printer-markdown.js +++ b/src/language-markdown/printer-markdown.js @@ -33,6 +33,7 @@ import { hasGitDiffFriendlyOrderedList, splitText, punctuationPattern, + punctuationRegex, INLINE_NODE_TYPES, INLINE_NODE_WRAPPER_TYPES, isAutolink, @@ -88,12 +89,6 @@ const lineBreakBetweenTheseAndCJKConvertToSpaceSymbolSet = new Set( "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" ); -/** - * Unicode punctuation character defined in [CommonMark Spec](https://spec.commonmark.org/0.30/#unicode-punctuation-character) - */ -const punctuationRegex = - /(?:\p{Pc}|\p{Pd}|\p{Pe}|\p{Pf}|\p{Pi}|\p{Po}|\p{Ps})/u; - function genericPrint(path, options, print) { const { node } = path; diff --git a/src/language-markdown/utils.js b/src/language-markdown/utils.js index e2b265124046..27bb1cd500bf 100644 --- a/src/language-markdown/utils.js +++ b/src/language-markdown/utils.js @@ -233,6 +233,7 @@ export { mapAst, splitText, punctuationPattern, + punctuationRegex, getFencedCodeBlockValue, getOrderedListItemInfo, hasGitDiffFriendlyOrderedList, From 07015ba76c292d891c4f39405b772e453c5c78db Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Sun, 9 Oct 2022 00:57:10 +0900 Subject: [PATCH 42/87] Remove unused `options` from `splitText(IntoSentences)` --- src/language-markdown/print-preprocess.js | 6 +++--- src/language-markdown/printer-markdown.js | 3 +-- src/language-markdown/utils.js | 3 +-- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/language-markdown/print-preprocess.js b/src/language-markdown/print-preprocess.js index ecc3d70d09a3..d8b6013c3e59 100644 --- a/src/language-markdown/print-preprocess.js +++ b/src/language-markdown/print-preprocess.js @@ -9,7 +9,7 @@ function preprocess(ast, options) { ast = transformInlineCode(ast, options); ast = transformIndentedCodeblockAndMarkItsParentList(ast, options); ast = markAlignedList(ast, options); - ast = splitTextIntoSentences(ast, options); + ast = splitTextIntoSentences(ast); ast = transformImportExport(ast); ast = mergeContinuousImportExport(ast); return ast; @@ -102,7 +102,7 @@ function mergeContinuousTexts(ast) { ); } -function splitTextIntoSentences(ast, options) { +function splitTextIntoSentences(ast) { return mapAst(ast, (node, index, [parentNode]) => { if (node.type !== "text") { return node; @@ -122,7 +122,7 @@ function splitTextIntoSentences(ast, options) { return { type: "sentence", position: node.position, - children: splitText(value, options), + children: splitText(value), }; }); } diff --git a/src/language-markdown/printer-markdown.js b/src/language-markdown/printer-markdown.js index 55a12d5f5283..addaea0451fc 100644 --- a/src/language-markdown/printer-markdown.js +++ b/src/language-markdown/printer-markdown.js @@ -97,8 +97,7 @@ function genericPrint(path, options, print) { options.originalText.slice( node.position.start.offset, node.position.end.offset - ), - options + ) ).map((node) => node.type === "word" ? node.value diff --git a/src/language-markdown/utils.js b/src/language-markdown/utils.js index 27bb1cd500bf..dd549c9abab6 100644 --- a/src/language-markdown/utils.js +++ b/src/language-markdown/utils.js @@ -63,8 +63,7 @@ const KIND_CJK_PUNCTUATION = "cjk-punctuation"; * split text into whitespaces and words * @param {string} text */ -// eslint-disable-next-line no-unused-vars -function splitText(text, _options) { +function splitText(text) { /** @type {Array} */ const nodes = []; From 0144ffc6b0d7101ec8b88f8dac87ec1b3d54c572 Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Sun, 9 Oct 2022 01:30:08 +0900 Subject: [PATCH 43/87] Fix comments --- src/language-markdown/printer-markdown.js | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/src/language-markdown/printer-markdown.js b/src/language-markdown/printer-markdown.js index addaea0451fc..5ea6221e7c0d 100644 --- a/src/language-markdown/printer-markdown.js +++ b/src/language-markdown/printer-markdown.js @@ -665,14 +665,13 @@ function canBeConvertedToSpace(path, value, adjacentNodes) { } const previousLastChar = adjacentNodes.previous.value?.at(-1); const nextFirstChar = adjacentNodes.next.value?.[0]; - // The following rules do not precede the above rules (`return false`). - // - // Cases: - // - CJ & non-CJK + + // From here down, only line breaks between CJ and alphanumeric characters are covered. + + // Convert newline between CJ and specific symbol characters (e.g. ASCII punctuation) to Space. + // e.g. :::\n句子句子句子\n::: → ::: 句子句子句子 ::: // - // 1. "\n" between special signs and CJ characters - // [corresponding sign][Space][any string][CJ][[\n]][target sign] - // we wonder if there are other marks to be considered. + // Note: Line breaks like "(\n句子句子\n)" or "句子\n." by Prettier are suppressed in `isBreakable(...)`. if ( lineBreakBetweenTheseAndCJKConvertToSpaceSymbolSet.has(nextFirstChar) || lineBreakBetweenTheseAndCJKConvertToSpaceSymbolSet.has(previousLastChar) @@ -686,9 +685,7 @@ function canBeConvertedToSpace(path, value, adjacentNodes) { ) { return false; } - // 2. If sentence uses space between CJ and alphanumerics (including hangul because of backward-compatibility), - // "\n" can be converted to Space. - // Note: Korean uses space to divide words, so it is difficult to determine if "\n" should be converted to Space. + // If the sentence uses the style that Space is injected in between CJ and alphanumerics, "\n" can be converted to Space. return isInSentenceWithCJSpaces(path); } From 949a93d0ac1188d07a42f62fab802471765ed634 Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Sun, 9 Oct 2022 12:00:47 +0900 Subject: [PATCH 44/87] Add test case to ignore trailing space --- .../markdown/splitCjkText/jsfmt.spec.js | 31 ++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/tests/format/markdown/splitCjkText/jsfmt.spec.js b/tests/format/markdown/splitCjkText/jsfmt.spec.js index 8ad59f4c81ae..eb4f5df98e2e 100644 --- a/tests/format/markdown/splitCjkText/jsfmt.spec.js +++ b/tests/format/markdown/splitCjkText/jsfmt.spec.js @@ -1 +1,30 @@ -run_spec(import.meta, ["markdown"], { proseWrap: "always" }); +const code = Array.from({ length: 6 }) + // less "文" <= [..., "文...文 ", "文...文", ...] => more "文" + .map((_, i) => ["文".repeat(39 + (i >> 1)), i & 1 ? "" : " "]) + // ["", "", ...] + .reduce( + (previousInputLines, [base, space]) => [ + ...previousInputLines, + `${base}${space}\n`, + ], + [] + ) + .join(""); +// (39 + 40 + 41) * 2 === 40 * 3 * 2 === 40 * 6 +// lineWidth is 80 and "文" is double-width +const output = `${"文".repeat(40)}\n`.repeat(6); + +run_spec( + { + importMeta: import.meta, + snippets: [ + { + code, + name: "Should ignore single trailing space after han before New Line before han", + output, + }, + ], + }, + ["markdown"], + { proseWrap: "always" } +); From a68b02c48b37503905118fea74ad96305409d802 Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Sun, 9 Oct 2022 14:28:07 +0900 Subject: [PATCH 45/87] Shorten variable names --- src/language-markdown/printer-markdown.js | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/language-markdown/printer-markdown.js b/src/language-markdown/printer-markdown.js index 5ea6221e7c0d..27bfa168e8ad 100644 --- a/src/language-markdown/printer-markdown.js +++ b/src/language-markdown/printer-markdown.js @@ -66,7 +66,7 @@ const SIBLING_NODE_TYPES = new Set([ * - Bad: `"檜原村(\nひのはらむら)"` * - Good: `"檜原村\n(ひのはらむら)"` or ``"檜原村(ひ\nのはらむら)"` */ -const noBreakAfterCharacterSet = new Set( +const noBreakAfter = new Set( "$(£¥·'\"〈《「『【〔〖〝﹙﹛$([{£¥[{‵︴︵︷︹︻︽︿﹁﹃﹏〘⦅«" ); /** @@ -77,7 +77,7 @@ const noBreakAfterCharacterSet = new Set( * - Bad: `"檜原村(ひのはらむら\n)以外には、"` * - Good: `"檜原村(ひのはらむ\nら)以外には、"` or `"檜原村(ひのはらむら)\n以外には、"` */ -const noBreakBeforeCharacterSet = new Set( +const noBreakBefore = new Set( "!%),.:;?]}¢°·'\"†‡›℃∶、。〃〆〕〗〞﹚﹜!"%'),.:;?]}~–—•〉》」︰︱︲︳﹐﹑﹒﹓﹔﹕﹖﹘︶︸︺︼︾﹀﹂﹗|、』】〙〟⦆»ヽヾーァィゥェォッャュョヮヵヶぁぃぅぇぉっゃゅょゎゕゖㇰㇱㇲㇳㇴㇵㇶㇷㇸㇹㇺㇻㇼㇽㇾㇿ々〻‐゠〜~‼⁇⁈⁉・" ); /** @@ -756,9 +756,9 @@ function isBreakable(path, value, options, adjacentNodes) { // https://en.wikipedia.org/wiki/Line_breaking_rules_in_East_Asian_languages const isBreakingCJKLineBreakingRule = (adjacentNodes.next?.value !== undefined && - noBreakBeforeCharacterSet.has(adjacentNodes.next?.value?.[0])) || + noBreakBefore.has(adjacentNodes.next?.value?.[0])) || (adjacentNodes.previous?.value !== undefined && - noBreakAfterCharacterSet.has(adjacentNodes.previous?.value?.at(-1))); + noBreakAfter.has(adjacentNodes.previous?.value?.at(-1))); // For "" (CJK and some non-space) higher priority than the following rule if (isBreakingCJKLineBreakingRule) { return false; From e0b34fbe3de679ea2335481a5705f0312b09fbe9 Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Sun, 9 Oct 2022 14:34:24 +0900 Subject: [PATCH 46/87] Fix comments --- src/language-markdown/printer-markdown.js | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/language-markdown/printer-markdown.js b/src/language-markdown/printer-markdown.js index 27bfa168e8ad..f33cefa0af5f 100644 --- a/src/language-markdown/printer-markdown.js +++ b/src/language-markdown/printer-markdown.js @@ -783,8 +783,9 @@ function printLine(path, value, options, adjacentNodes) { if (value !== "\n") { return convertToLineIfBreakable(value); } - // Chinese and Japanese does not use U+0020 Space to divide words, so U+00A0 No-break space must not be replaced with it. + // Chinese and Japanese does not use U+0020 Space to divide words, so U+000A End of Line must not be replaced with it. // Behavior in other languages will not be changed because there are too much things to consider. (PR welcome) + // e.g. Word segmentation in Thai etc. return convertToLineIfBreakable( canBeConvertedToSpace(path, value, adjacentNodes) ? " " : "" ); From b92974194362f52b9d0950295929224ff767ccfe Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Sun, 9 Oct 2022 14:56:45 +0900 Subject: [PATCH 47/87] Fix comments in `canBeConvertedToSpace` --- src/language-markdown/printer-markdown.js | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/language-markdown/printer-markdown.js b/src/language-markdown/printer-markdown.js index f33cefa0af5f..e2a04d9d3405 100644 --- a/src/language-markdown/printer-markdown.js +++ b/src/language-markdown/printer-markdown.js @@ -666,7 +666,7 @@ function canBeConvertedToSpace(path, value, adjacentNodes) { const previousLastChar = adjacentNodes.previous.value?.at(-1); const nextFirstChar = adjacentNodes.next.value?.[0]; - // From here down, only line breaks between CJ and alphanumeric characters are covered. + // From here down, only line breaks between CJ and non-CJK characters are covered. // Convert newline between CJ and specific symbol characters (e.g. ASCII punctuation) to Space. // e.g. :::\n句子句子句子\n::: → ::: 句子句子句子 ::: @@ -679,6 +679,11 @@ function canBeConvertedToSpace(path, value, adjacentNodes) { return true; } // Converting newline between CJ and non-ASCII punctuation to Space does not seem to be better in many cases. (PR welcome) + // e.g. + // 1. “ア〜エの中から1つ選べ。” + // "〜" (U+301C) belongs to Pd, and "\n" in "ア〜\nエの中から1つ選べ。" must not be converted to Space. + // 2. “これはひどい……なんと汚いコミットログなんだ……” + // "…" (U+2026) belongs to Po, and "\n" in "これはひどい……\nなんと汚いコミットログなんだ……" must not, either. if ( punctuationRegex.test(previousLastChar) || punctuationRegex.test(nextFirstChar) From 7765d7bff99249f611056436b3d7f2871bba8cd7 Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Sun, 9 Oct 2022 15:16:26 +0900 Subject: [PATCH 48/87] Move whitespace functions and variables to new file --- src/language-markdown/printer-markdown.js | 243 +-------------------- src/language-markdown/whitespace.js | 255 ++++++++++++++++++++++ 2 files changed, 258 insertions(+), 240 deletions(-) create mode 100644 src/language-markdown/whitespace.js diff --git a/src/language-markdown/printer-markdown.js b/src/language-markdown/printer-markdown.js index e2a04d9d3405..c1d265f6bfe9 100644 --- a/src/language-markdown/printer-markdown.js +++ b/src/language-markdown/printer-markdown.js @@ -33,16 +33,12 @@ import { hasGitDiffFriendlyOrderedList, splitText, punctuationPattern, - punctuationRegex, INLINE_NODE_TYPES, INLINE_NODE_WRAPPER_TYPES, isAutolink, - KIND_CJK_PUNCTUATION, - KIND_NON_CJK, - KIND_K_LETTER, - KIND_CJ_LETTER, } from "./utils.js"; import visitorKeys from "./visitor-keys.js"; +import { canBeConvertedToSpace, isBreakable } from "./whitespace.js"; const getVisitorKeys = createGetVisitorKeys(visitorKeys); @@ -51,43 +47,11 @@ const getVisitorKeys = createGetVisitorKeys(visitorKeys); */ const TRAILING_HARDLINE_NODES = new Set(["importExport"]); -const SINGLE_LINE_NODE_TYPES = ["heading", "tableCell", "link", "wikiLink"]; const SIBLING_NODE_TYPES = new Set([ "listItem", "definition", "footnoteDefinition", ]); -// https://en.wikipedia.org/wiki/Line_breaking_rules_in_East_Asian_languages -/** - * The set of characters that must not immediately precede a line break - * - * e.g. `"("` - * - * - Bad: `"檜原村(\nひのはらむら)"` - * - Good: `"檜原村\n(ひのはらむら)"` or ``"檜原村(ひ\nのはらむら)"` - */ -const noBreakAfter = new Set( - "$(£¥·'\"〈《「『【〔〖〝﹙﹛$([{£¥[{‵︴︵︷︹︻︽︿﹁﹃﹏〘⦅«" -); -/** - * The set of characters that must not immediately follow a line break - * - * e.g. `")"` - * - * - Bad: `"檜原村(ひのはらむら\n)以外には、"` - * - Good: `"檜原村(ひのはらむ\nら)以外には、"` or `"檜原村(ひのはらむら)\n以外には、"` - */ -const noBreakBefore = new Set( - "!%),.:;?]}¢°·'\"†‡›℃∶、。〃〆〕〗〞﹚﹜!"%'),.:;?]}~–—•〉》」︰︱︲︳﹐﹑﹒﹓﹔﹕﹖﹘︶︸︺︼︾﹀﹂﹗|、』】〙〟⦆»ヽヾーァィゥェォッャュョヮヵヶぁぃぅぇぉっゃゅょゎゕゖㇰㇱㇲㇳㇴㇵㇶㇷㇸㇹㇺㇻㇼㇽㇾㇿ々〻‐゠〜~‼⁇⁈⁉・" -); -/** - * The set of characters whose surrounding newline may be converted to Space - * - * - ASCII punctuation marks - */ -const lineBreakBetweenTheseAndCJKConvertToSpaceSymbolSet = new Set( - "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" -); function genericPrint(path, options, print) { const { node } = path; @@ -560,46 +524,6 @@ function getAncestorNode(path, typeOrTypes) { return counter === -1 ? null : path.getParentNode(counter); } -/** - * Finds out if Space is tend to be inserted between Chinese or Japanese characters - * (including ideograph aka han or kanji e.g. `字`, hiragana e.g. `あ`, and katakana e.g. `ア`) - * and other letters (including alphanumerics; e.g. `A` or `1`) in the sentence. - * - * @param {*} path current position in nodes tree - * @returns {boolean} `true` if Space is tend to be inserted between these types of letters, `false` otherwise. - */ -function isInSentenceWithCJSpaces(path) { - const sentenceNode = getAncestorNode(path, "sentence"); - if (sentenceNode.usesCJSpaces !== undefined) { - return sentenceNode.usesCJSpaces; - } - - const cjNonCJKSpacingStatistics = { - " ": 0, - "": 0, - }; - for (let i = 0; i < sentenceNode.children.length; ++i) { - const node = sentenceNode.children[i]; - if ( - node.type === "whitespace" && - (node.value === " " || node.value === "") - ) { - const previousKind = sentenceNode.children[i - 1]?.kind; - const nextKind = sentenceNode.children[i + 1]?.kind; - if ( - (previousKind === "cj-letter" && nextKind === "non-cjk") || - (previousKind === "non-cjk" && nextKind === "cj-letter") - ) { - ++cjNonCJKSpacingStatistics[node.value]; - } - } - } - // Injects a property to cache the result. - sentenceNode.usesCJSpaces = - cjNonCJKSpacingStatistics[" "] > cjNonCJKSpacingStatistics[""]; - return sentenceNode.usesCJSpaces; -} - /** * @typedef {import("./utils.js").TextNode} TextNode * @typedef {import("./utils.js").WhitespaceValue} WhitespaceValue @@ -608,169 +532,6 @@ function isInSentenceWithCJSpaces(path) { * @typedef {import("../common/ast-path.js").default} AstPath */ -/** - * Checks if given node can be converted to Space - * - * For example, if you would like to squash the multi-line string `"You might want\nto use Prettier."` into a single line, - * you would replace `"\n"` with `" "`. (`"You might want to use Prettier."`) - * - * However, you should note that Chinese and Japanese does not use U+0020 Space to divide words, so U+000A End of Line must not be replaced with it. - * Behavior in other languages (e.g. Thai) will not be changed because there are too much things to consider. (PR welcome) - * - * @param {AstPath} path path of given node - * @param {WhitespaceValue} value value of given node (typically `" "` or `"\n"`) - * @param {AdjacentNodes | undefined} adjacentNodes adjacent sibling nodes of given node - * @returns {boolean} `true` if given node can be converted to space, `false` if not (i.e. newline or empty character) - */ -function canBeConvertedToSpace(path, value, adjacentNodes) { - // "\n" or " ", of course " " always can be converted to Space - if (value !== "\n") { - return true; - } - // no adjacent nodes - if (!adjacentNodes) { - return true; - } - // e.g. " \nletter" - if (!adjacentNodes.previous || !adjacentNodes.next) { - return true; - } - const previousKind = adjacentNodes.previous.kind; - const nextKind = adjacentNodes.next.kind; - // "\n" between not western or Korean (han, kana, CJK punctuations) characters always can converted to Space - // Korean hangul simulates latin words; see #6516 (https://github.com/prettier/prettier/issues/6516) - if ( - isWesternOrKoreanLetter(previousKind) && - isWesternOrKoreanLetter(nextKind) - ) { - return true; - } - // han & hangul: same way preferred - if ( - (previousKind === KIND_K_LETTER && nextKind === KIND_CJ_LETTER) || - (nextKind === KIND_K_LETTER && previousKind === KIND_CJ_LETTER) - ) { - return true; - } - // Do not convert it to Space when: - if ( - // Shall not be converted to Space around CJK punctuation - previousKind === KIND_CJK_PUNCTUATION || - nextKind === KIND_CJK_PUNCTUATION || - // "\n" between CJ always SHALL NOT be converted to space - // "\n" between Korean and CJ is better not to be converted to space - (isCJK(previousKind) && isCJK(nextKind)) - ) { - return false; - } - const previousLastChar = adjacentNodes.previous.value?.at(-1); - const nextFirstChar = adjacentNodes.next.value?.[0]; - - // From here down, only line breaks between CJ and non-CJK characters are covered. - - // Convert newline between CJ and specific symbol characters (e.g. ASCII punctuation) to Space. - // e.g. :::\n句子句子句子\n::: → ::: 句子句子句子 ::: - // - // Note: Line breaks like "(\n句子句子\n)" or "句子\n." by Prettier are suppressed in `isBreakable(...)`. - if ( - lineBreakBetweenTheseAndCJKConvertToSpaceSymbolSet.has(nextFirstChar) || - lineBreakBetweenTheseAndCJKConvertToSpaceSymbolSet.has(previousLastChar) - ) { - return true; - } - // Converting newline between CJ and non-ASCII punctuation to Space does not seem to be better in many cases. (PR welcome) - // e.g. - // 1. “ア〜エの中から1つ選べ。” - // "〜" (U+301C) belongs to Pd, and "\n" in "ア〜\nエの中から1つ選べ。" must not be converted to Space. - // 2. “これはひどい……なんと汚いコミットログなんだ……” - // "…" (U+2026) belongs to Po, and "\n" in "これはひどい……\nなんと汚いコミットログなんだ……" must not, either. - if ( - punctuationRegex.test(previousLastChar) || - punctuationRegex.test(nextFirstChar) - ) { - return false; - } - // If the sentence uses the style that Space is injected in between CJ and alphanumerics, "\n" can be converted to Space. - return isInSentenceWithCJSpaces(path); -} - -/** - * @param {WordKind | undefined} kind - * @returns {boolean} `true` if `kind` is CJK (including punctuation marks) - */ -function isCJK(kind) { - return ( - kind === KIND_CJ_LETTER || - kind === KIND_K_LETTER || - kind === KIND_CJK_PUNCTUATION - ); -} - -/** - * @param {WordKind | undefined} kind - * @returns {boolean} `true` if `kind` is letter (not CJK punctuation) - */ -function isLetter(kind) { - return ( - kind === KIND_NON_CJK || kind === KIND_CJ_LETTER || kind === KIND_K_LETTER - ); -} - -/** - * @param {WordKind | undefined} kind - * @returns {boolean} `true` if `kind` is western or Korean letters (divides words by Space) - */ -function isWesternOrKoreanLetter(kind) { - return kind === KIND_NON_CJK || kind === KIND_K_LETTER; -} - -/** - * Returns whether “whitespace” (`"" | " " | "\n"`; see `WhitespaceValue`) can converted to `"\n"` - * - * @param {AstPath} path - * @param {WhitespaceValue} value - * @param {*} options - * @param {AdjacentNodes | undefined} [adjacentNodes] - * @returns {boolean} `true` if “whitespace” can be converted to `"\n"` - */ -function isBreakable(path, value, options, adjacentNodes) { - if (options.proseWrap !== "always") { - return false; - } - if (getAncestorNode(path, SINGLE_LINE_NODE_TYPES)) { - return false; - } - if (adjacentNodes === undefined) { - return true; - } - // Space & newline are always breakable - if (value !== "") { - return true; - } - // Simulates latin words; see #6516 (https://github.com/prettier/prettier/issues/6516) - // [Latin][""][Hangul] & vice versa => Don't break - // [han & kana][""][hangul], either - if ( - (adjacentNodes.previous?.kind === KIND_K_LETTER && - isLetter(adjacentNodes.next?.kind)) || - (adjacentNodes.next?.kind === KIND_K_LETTER && - isLetter(adjacentNodes.previous?.kind)) - ) { - return false; - } - // https://en.wikipedia.org/wiki/Line_breaking_rules_in_East_Asian_languages - const isBreakingCJKLineBreakingRule = - (adjacentNodes.next?.value !== undefined && - noBreakBefore.has(adjacentNodes.next?.value?.[0])) || - (adjacentNodes.previous?.value !== undefined && - noBreakAfter.has(adjacentNodes.previous?.value?.at(-1))); - // For "" (CJK and some non-space) higher priority than the following rule - if (isBreakingCJKLineBreakingRule) { - return false; - } - return true; -} - /** * @param {AstPath} path * @param {WhitespaceValue} value @@ -1169,6 +930,8 @@ function hasPrettierIgnore(path) { return path.index > 0 && isPrettierIgnore(path.previous) === "next"; } +export { getAncestorNode }; + const printer = { preprocess, print: genericPrint, diff --git a/src/language-markdown/whitespace.js b/src/language-markdown/whitespace.js new file mode 100644 index 000000000000..bad365a54154 --- /dev/null +++ b/src/language-markdown/whitespace.js @@ -0,0 +1,255 @@ +import { getAncestorNode } from "./printer-markdown.js"; +import { + KIND_CJK_PUNCTUATION, + KIND_CJ_LETTER, + KIND_K_LETTER, + KIND_NON_CJK, + punctuationRegex, +} from "./utils.js"; + +const SINGLE_LINE_NODE_TYPES = ["heading", "tableCell", "link", "wikiLink"]; + +// https://en.wikipedia.org/wiki/Line_breaking_rules_in_East_Asian_languages +/** + * The set of characters that must not immediately precede a line break + * + * e.g. `"("` + * + * - Bad: `"檜原村(\nひのはらむら)"` + * - Good: `"檜原村\n(ひのはらむら)"` or ``"檜原村(ひ\nのはらむら)"` + */ +const noBreakAfter = new Set( + "$(£¥·'\"〈《「『【〔〖〝﹙﹛$([{£¥[{‵︴︵︷︹︻︽︿﹁﹃﹏〘⦅«" +); +/** + * The set of characters that must not immediately follow a line break + * + * e.g. `")"` + * + * - Bad: `"檜原村(ひのはらむら\n)以外には、"` + * - Good: `"檜原村(ひのはらむ\nら)以外には、"` or `"檜原村(ひのはらむら)\n以外には、"` + */ +const noBreakBefore = new Set( + "!%),.:;?]}¢°·'\"†‡›℃∶、。〃〆〕〗〞﹚﹜!"%'),.:;?]}~–—•〉》」︰︱︲︳﹐﹑﹒﹓﹔﹕﹖﹘︶︸︺︼︾﹀﹂﹗|、』】〙〟⦆»ヽヾーァィゥェォッャュョヮヵヶぁぃぅぇぉっゃゅょゎゕゖㇰㇱㇲㇳㇴㇵㇶㇷㇸㇹㇺㇻㇼㇽㇾㇿ々〻‐゠〜~‼⁇⁈⁉・" +); +/** + * The set of characters whose surrounding newline may be converted to Space + * + * - ASCII punctuation marks + */ +const lineBreakBetweenTheseAndCJKConvertToSpaceSymbolSet = new Set( + "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" +); + +/** + * Finds out if Space is tend to be inserted between Chinese or Japanese characters + * (including ideograph aka han or kanji e.g. `字`, hiragana e.g. `あ`, and katakana e.g. `ア`) + * and other letters (including alphanumerics; e.g. `A` or `1`) in the sentence. + * + * @param {*} path current position in nodes tree + * @returns {boolean} `true` if Space is tend to be inserted between these types of letters, `false` otherwise. + */ +function isInSentenceWithCJSpaces(path) { + const sentenceNode = getAncestorNode(path, "sentence"); + if (sentenceNode.usesCJSpaces !== undefined) { + return sentenceNode.usesCJSpaces; + } + + const cjNonCJKSpacingStatistics = { + " ": 0, + "": 0, + }; + for (let i = 0; i < sentenceNode.children.length; ++i) { + const node = sentenceNode.children[i]; + if ( + node.type === "whitespace" && + (node.value === " " || node.value === "") + ) { + const previousKind = sentenceNode.children[i - 1]?.kind; + const nextKind = sentenceNode.children[i + 1]?.kind; + if ( + (previousKind === "cj-letter" && nextKind === "non-cjk") || + (previousKind === "non-cjk" && nextKind === "cj-letter") + ) { + ++cjNonCJKSpacingStatistics[node.value]; + } + } + } + // Injects a property to cache the result. + sentenceNode.usesCJSpaces = + cjNonCJKSpacingStatistics[" "] > cjNonCJKSpacingStatistics[""]; + return sentenceNode.usesCJSpaces; +} + +/** + * @typedef {import("./utils.js").TextNode} TextNode + * @typedef {import("./utils.js").WhitespaceValue} WhitespaceValue + * @typedef {{next?: TextNode | undefined | null, previous?: TextNode | undefined | null}} AdjacentNodes + * @typedef {import("./utils.js").WordKind} WordKind + * @typedef {import("../common/ast-path.js").default} AstPath + */ + +/** + * Checks if given node can be converted to Space + * + * For example, if you would like to squash the multi-line string `"You might want\nto use Prettier."` into a single line, + * you would replace `"\n"` with `" "`. (`"You might want to use Prettier."`) + * + * However, you should note that Chinese and Japanese does not use U+0020 Space to divide words, so U+000A End of Line must not be replaced with it. + * Behavior in other languages (e.g. Thai) will not be changed because there are too much things to consider. (PR welcome) + * + * @param {AstPath} path path of given node + * @param {WhitespaceValue} value value of given node (typically `" "` or `"\n"`) + * @param {AdjacentNodes | undefined} adjacentNodes adjacent sibling nodes of given node + * @returns {boolean} `true` if given node can be converted to space, `false` if not (i.e. newline or empty character) + */ +function canBeConvertedToSpace(path, value, adjacentNodes) { + // "\n" or " ", of course " " always can be converted to Space + if (value !== "\n") { + return true; + } + // no adjacent nodes + if (!adjacentNodes) { + return true; + } + // e.g. " \nletter" + if (!adjacentNodes.previous || !adjacentNodes.next) { + return true; + } + const previousKind = adjacentNodes.previous.kind; + const nextKind = adjacentNodes.next.kind; + // "\n" between not western or Korean (han, kana, CJK punctuations) characters always can converted to Space + // Korean hangul simulates latin words; see #6516 (https://github.com/prettier/prettier/issues/6516) + if ( + isWesternOrKoreanLetter(previousKind) && + isWesternOrKoreanLetter(nextKind) + ) { + return true; + } + // han & hangul: same way preferred + if ( + (previousKind === KIND_K_LETTER && nextKind === KIND_CJ_LETTER) || + (nextKind === KIND_K_LETTER && previousKind === KIND_CJ_LETTER) + ) { + return true; + } + // Do not convert it to Space when: + if ( + // Shall not be converted to Space around CJK punctuation + previousKind === KIND_CJK_PUNCTUATION || + nextKind === KIND_CJK_PUNCTUATION || + // "\n" between CJ always SHALL NOT be converted to space + // "\n" between Korean and CJ is better not to be converted to space + (isCJK(previousKind) && isCJK(nextKind)) + ) { + return false; + } + const previousLastChar = adjacentNodes.previous.value?.at(-1); + const nextFirstChar = adjacentNodes.next.value?.[0]; + + // From here down, only line breaks between CJ and non-CJK characters are covered. + + // Convert newline between CJ and specific symbol characters (e.g. ASCII punctuation) to Space. + // e.g. :::\n句子句子句子\n::: → ::: 句子句子句子 ::: + // + // Note: Line breaks like "(\n句子句子\n)" or "句子\n." by Prettier are suppressed in `isBreakable(...)`. + if ( + lineBreakBetweenTheseAndCJKConvertToSpaceSymbolSet.has(nextFirstChar) || + lineBreakBetweenTheseAndCJKConvertToSpaceSymbolSet.has(previousLastChar) + ) { + return true; + } + // Converting newline between CJ and non-ASCII punctuation to Space does not seem to be better in many cases. (PR welcome) + // e.g. + // 1. “ア〜エの中から1つ選べ。” + // "〜" (U+301C) belongs to Pd, and "\n" in "ア〜\nエの中から1つ選べ。" must not be converted to Space. + // 2. “これはひどい……なんと汚いコミットログなんだ……” + // "…" (U+2026) belongs to Po, and "\n" in "これはひどい……\nなんと汚いコミットログなんだ……" must not, either. + if ( + punctuationRegex.test(previousLastChar) || + punctuationRegex.test(nextFirstChar) + ) { + return false; + } + // If the sentence uses the style that Space is injected in between CJ and alphanumerics, "\n" can be converted to Space. + return isInSentenceWithCJSpaces(path); +} + +/** + * @param {WordKind | undefined} kind + * @returns {boolean} `true` if `kind` is CJK (including punctuation marks) + */ +function isCJK(kind) { + return ( + kind === KIND_CJ_LETTER || + kind === KIND_K_LETTER || + kind === KIND_CJK_PUNCTUATION + ); +} + +/** + * @param {WordKind | undefined} kind + * @returns {boolean} `true` if `kind` is letter (not CJK punctuation) + */ +function isLetter(kind) { + return ( + kind === KIND_NON_CJK || kind === KIND_CJ_LETTER || kind === KIND_K_LETTER + ); +} + +/** + * @param {WordKind | undefined} kind + * @returns {boolean} `true` if `kind` is western or Korean letters (divides words by Space) + */ +function isWesternOrKoreanLetter(kind) { + return kind === KIND_NON_CJK || kind === KIND_K_LETTER; +} + +/** + * Returns whether “whitespace” (`"" | " " | "\n"`; see `WhitespaceValue`) can converted to `"\n"` + * + * @param {AstPath} path + * @param {WhitespaceValue} value + * @param {*} options + * @param {AdjacentNodes | undefined} [adjacentNodes] + * @returns {boolean} `true` if “whitespace” can be converted to `"\n"` + */ +function isBreakable(path, value, options, adjacentNodes) { + if (options.proseWrap !== "always") { + return false; + } + if (getAncestorNode(path, SINGLE_LINE_NODE_TYPES)) { + return false; + } + if (adjacentNodes === undefined) { + return true; + } + // Space & newline are always breakable + if (value !== "") { + return true; + } + // Simulates latin words; see #6516 (https://github.com/prettier/prettier/issues/6516) + // [Latin][""][Hangul] & vice versa => Don't break + // [han & kana][""][hangul], either + if ( + (adjacentNodes.previous?.kind === KIND_K_LETTER && + isLetter(adjacentNodes.next?.kind)) || + (adjacentNodes.next?.kind === KIND_K_LETTER && + isLetter(adjacentNodes.previous?.kind)) + ) { + return false; + } + // https://en.wikipedia.org/wiki/Line_breaking_rules_in_East_Asian_languages + const isBreakingCJKLineBreakingRule = + (adjacentNodes.next?.value !== undefined && + noBreakBefore.has(adjacentNodes.next?.value?.[0])) || + (adjacentNodes.previous?.value !== undefined && + noBreakAfter.has(adjacentNodes.previous?.value?.at(-1))); + // For "" (CJK and some non-space) higher priority than the following rule + if (isBreakingCJKLineBreakingRule) { + return false; + } + return true; +} + +export { canBeConvertedToSpace, isBreakable }; From 466f91e9e6fb14ff3f0f7055d403df78af915aaf Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Sun, 9 Oct 2022 18:30:12 +0900 Subject: [PATCH 49/87] Shorten so long variable name --- src/language-markdown/whitespace.js | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/language-markdown/whitespace.js b/src/language-markdown/whitespace.js index bad365a54154..b50853b26bf1 100644 --- a/src/language-markdown/whitespace.js +++ b/src/language-markdown/whitespace.js @@ -37,7 +37,7 @@ const noBreakBefore = new Set( * * - ASCII punctuation marks */ -const lineBreakBetweenTheseAndCJKConvertToSpaceSymbolSet = new Set( +const lineBreakBetweenTheseAndCJKConvertsToSpace = new Set( "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" ); @@ -154,8 +154,8 @@ function canBeConvertedToSpace(path, value, adjacentNodes) { // // Note: Line breaks like "(\n句子句子\n)" or "句子\n." by Prettier are suppressed in `isBreakable(...)`. if ( - lineBreakBetweenTheseAndCJKConvertToSpaceSymbolSet.has(nextFirstChar) || - lineBreakBetweenTheseAndCJKConvertToSpaceSymbolSet.has(previousLastChar) + lineBreakBetweenTheseAndCJKConvertsToSpace.has(nextFirstChar) || + lineBreakBetweenTheseAndCJKConvertsToSpace.has(previousLastChar) ) { return true; } From 4b7c07aec9e1bc639fb5e27a8214943d15530962 Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Sun, 9 Oct 2022 22:50:56 +0900 Subject: [PATCH 50/87] Ensure `WhitespaceNode` is not adjacent to another one --- src/language-markdown/printer-markdown.js | 5 +++-- src/language-markdown/utils.js | 21 +++++++++++++++++++-- src/language-markdown/whitespace.js | 10 ++++++---- 3 files changed, 28 insertions(+), 8 deletions(-) diff --git a/src/language-markdown/printer-markdown.js b/src/language-markdown/printer-markdown.js index c1d265f6bfe9..75f813540966 100644 --- a/src/language-markdown/printer-markdown.js +++ b/src/language-markdown/printer-markdown.js @@ -135,6 +135,7 @@ function genericPrint(path, options, print) { return escapedValue; } case "whitespace": { + // WordNode or nullish const { next, previous } = path; const proseWrap = @@ -525,9 +526,9 @@ function getAncestorNode(path, typeOrTypes) { } /** - * @typedef {import("./utils.js").TextNode} TextNode + * @typedef {import("./utils.js").WordNode} WordNode * @typedef {import("./utils.js").WhitespaceValue} WhitespaceValue - * @typedef {{next?: TextNode | undefined | null, previous?: TextNode | undefined | null}} AdjacentNodes + * @typedef {import("./whitespace").AdjacentNodes} AdjacentNodes * @typedef {import("./utils.js").WordKind} WordKind * @typedef {import("../common/ast-path.js").default} AstPath */ diff --git a/src/language-markdown/utils.js b/src/language-markdown/utils.js index dd549c9abab6..5ba67a137063 100644 --- a/src/language-markdown/utils.js +++ b/src/language-markdown/utils.js @@ -50,13 +50,16 @@ const KIND_CJK_PUNCTUATION = "cjk-punctuation"; * kind?: undefined, * hasLeadingPunctuation?: undefined, * hasTrailingPunctuation?: undefined, - * } | { + * }} WhitespaceNode + * @typedef {{ * type: "word", * value: string, * kind: WordKind, * hasLeadingPunctuation: boolean, * hasTrailingPunctuation: boolean, - * }} TextNode + * }} WordNode + * Node for a single CJK character or a sequence of non-CJK characters + * @typedef {WhitespaceNode | WordNode} TextNode */ /** @@ -129,6 +132,20 @@ function splitText(text) { } } + // Check for `canBeConvertedToSpace` in ./whitespace.js etc. + if (process.env.NODE_ENV !== "production") { + for (let i = 1; i < nodes.length; i++) { + if ( + nodes[i].type === "whitespace" && + nodes[i - 1].type === "whitespace" + ) { + throw new Error( + "Internal error: splitText should not create consecutive whitespace nodes" + ); + } + } + } + return nodes; function appendNode(node) { diff --git a/src/language-markdown/whitespace.js b/src/language-markdown/whitespace.js index b50853b26bf1..2ac47b60e8e0 100644 --- a/src/language-markdown/whitespace.js +++ b/src/language-markdown/whitespace.js @@ -81,10 +81,12 @@ function isInSentenceWithCJSpaces(path) { return sentenceNode.usesCJSpaces; } +// /** - * @typedef {import("./utils.js").TextNode} TextNode + * @typedef {import("./utils.js").WordNode} WordNode * @typedef {import("./utils.js").WhitespaceValue} WhitespaceValue - * @typedef {{next?: TextNode | undefined | null, previous?: TextNode | undefined | null}} AdjacentNodes + * @typedef {{next?: WordNode | undefined | null, previous?: WordNode | undefined | null}} AdjacentNodes + * Adjacent node to `WhitespaceNode`. the consecution of `WhitespaceNode` is a bug, so adjacent nodes must be `WordNode`. * @typedef {import("./utils.js").WordKind} WordKind * @typedef {import("../common/ast-path.js").default} AstPath */ @@ -144,8 +146,8 @@ function canBeConvertedToSpace(path, value, adjacentNodes) { ) { return false; } - const previousLastChar = adjacentNodes.previous.value?.at(-1); - const nextFirstChar = adjacentNodes.next.value?.[0]; + const previousLastChar = adjacentNodes.previous.value.at(-1); + const nextFirstChar = adjacentNodes.next.value[0]; // From here down, only line breaks between CJ and non-CJK characters are covered. From 0bdf14e76c696ba3c61c39ee19edeea2340f1eea Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Sun, 9 Oct 2022 23:09:21 +0900 Subject: [PATCH 51/87] Rename and move `printLine` (new name: `printWhitespace`) --- src/language-markdown/printer-markdown.js | 49 ++--------------------- src/language-markdown/whitespace.js | 42 ++++++++++++++++++- 2 files changed, 44 insertions(+), 47 deletions(-) diff --git a/src/language-markdown/printer-markdown.js b/src/language-markdown/printer-markdown.js index 75f813540966..3ea5cf017bb4 100644 --- a/src/language-markdown/printer-markdown.js +++ b/src/language-markdown/printer-markdown.js @@ -38,7 +38,7 @@ import { isAutolink, } from "./utils.js"; import visitorKeys from "./visitor-keys.js"; -import { canBeConvertedToSpace, isBreakable } from "./whitespace.js"; +import { printWhitespace } from "./whitespace.js"; const getVisitorKeys = createGetVisitorKeys(visitorKeys); @@ -67,7 +67,7 @@ function genericPrint(path, options, print) { ? node.value : node.value === "" ? "" - : printLine(path, node.value, options) + : printWhitespace(path, node.value, options) ); } @@ -144,7 +144,7 @@ function genericPrint(path, options, print) { ? "never" : options.proseWrap; - return printLine(path, node.value, { proseWrap }, { previous, next }); + return printWhitespace(path, node.value, { proseWrap }, { previous, next }); } case "emphasis": { let style; @@ -525,49 +525,6 @@ function getAncestorNode(path, typeOrTypes) { return counter === -1 ? null : path.getParentNode(counter); } -/** - * @typedef {import("./utils.js").WordNode} WordNode - * @typedef {import("./utils.js").WhitespaceValue} WhitespaceValue - * @typedef {import("./whitespace").AdjacentNodes} AdjacentNodes - * @typedef {import("./utils.js").WordKind} WordKind - * @typedef {import("../common/ast-path.js").default} AstPath - */ - -/** - * @param {AstPath} path - * @param {WhitespaceValue} value - * @param {*} options - * @param {AdjacentNodes | undefined} [adjacentNodes] - */ -function printLine(path, value, options, adjacentNodes) { - if (options.proseWrap === "preserve" && value === "\n") { - return hardline; - } - - const isBreakable_ = isBreakable(path, value, options, adjacentNodes); - - // Space or empty - if (value !== "\n") { - return convertToLineIfBreakable(value); - } - // Chinese and Japanese does not use U+0020 Space to divide words, so U+000A End of Line must not be replaced with it. - // Behavior in other languages will not be changed because there are too much things to consider. (PR welcome) - // e.g. Word segmentation in Thai etc. - return convertToLineIfBreakable( - canBeConvertedToSpace(path, value, adjacentNodes) ? " " : "" - ); - - /** - * @param value {" " | ""} - */ - function convertToLineIfBreakable(value) { - if (!isBreakable_) { - return value; - } - return value === " " ? line : softline; - } -} - function printTable(path, options, print) { const { node } = path; diff --git a/src/language-markdown/whitespace.js b/src/language-markdown/whitespace.js index 2ac47b60e8e0..fc4dc303bbf7 100644 --- a/src/language-markdown/whitespace.js +++ b/src/language-markdown/whitespace.js @@ -1,3 +1,4 @@ +import { hardline, line, softline } from "../document/builders.js"; import { getAncestorNode } from "./printer-markdown.js"; import { KIND_CJK_PUNCTUATION, @@ -254,4 +255,43 @@ function isBreakable(path, value, options, adjacentNodes) { return true; } -export { canBeConvertedToSpace, isBreakable }; +/** + * @param {AstPath} path + * @param {WhitespaceValue} value + * @param {*} options + * @param {AdjacentNodes | undefined} [adjacentNodes] + */ +function printWhitespace(path, value, options, adjacentNodes) { + if (options.proseWrap === "preserve" && value === "\n") { + return hardline; + } + + const isBreakable_ = isBreakable(path, value, options, adjacentNodes); + + // Space or empty + if (value !== "\n") { + return convertToLineIfBreakable(value); + } + // Chinese and Japanese does not use U+0020 Space to divide words, so U+000A End of Line must not be replaced with it. + // Behavior in other languages will not be changed because there are too much things to consider. (PR welcome) + // e.g. Word segmentation in Thai etc. + return convertToLineIfBreakable( + canBeConvertedToSpace(path, value, adjacentNodes) ? " " : "" + ); + + /** + * @param value {" " | ""} + */ + function convertToLineIfBreakable(value) { + if (!isBreakable_) { + return value; + } + return value === " " ? line : softline; + } +} + +export { + canBeConvertedToSpace, + isBreakable, + printWhitespace, +}; From 1f8093dcdbb47d8fbab96dd951bd99899820e749 Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Sun, 9 Oct 2022 23:12:20 +0900 Subject: [PATCH 52/87] Make variable names clerer ones --- src/language-markdown/whitespace.js | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/language-markdown/whitespace.js b/src/language-markdown/whitespace.js index fc4dc303bbf7..f7657e1b2cc8 100644 --- a/src/language-markdown/whitespace.js +++ b/src/language-markdown/whitespace.js @@ -147,8 +147,8 @@ function canBeConvertedToSpace(path, value, adjacentNodes) { ) { return false; } - const previousLastChar = adjacentNodes.previous.value.at(-1); - const nextFirstChar = adjacentNodes.next.value[0]; + const characterBefore = adjacentNodes.previous.value.at(-1); + const characterAfter = adjacentNodes.next.value[0]; // From here down, only line breaks between CJ and non-CJK characters are covered. @@ -157,8 +157,8 @@ function canBeConvertedToSpace(path, value, adjacentNodes) { // // Note: Line breaks like "(\n句子句子\n)" or "句子\n." by Prettier are suppressed in `isBreakable(...)`. if ( - lineBreakBetweenTheseAndCJKConvertsToSpace.has(nextFirstChar) || - lineBreakBetweenTheseAndCJKConvertsToSpace.has(previousLastChar) + lineBreakBetweenTheseAndCJKConvertsToSpace.has(characterAfter) || + lineBreakBetweenTheseAndCJKConvertsToSpace.has(characterBefore) ) { return true; } @@ -169,8 +169,8 @@ function canBeConvertedToSpace(path, value, adjacentNodes) { // 2. “これはひどい……なんと汚いコミットログなんだ……” // "…" (U+2026) belongs to Po, and "\n" in "これはひどい……\nなんと汚いコミットログなんだ……" must not, either. if ( - punctuationRegex.test(previousLastChar) || - punctuationRegex.test(nextFirstChar) + punctuationRegex.test(characterBefore) || + punctuationRegex.test(characterAfter) ) { return false; } From 6db7406817ae865964d559f1f47d20cdb63d11d6 Mon Sep 17 00:00:00 2001 From: Georgii Dolzhykov Date: Sun, 9 Oct 2022 20:54:31 +0300 Subject: [PATCH 53/87] Edit changelog entry --- changelog_unreleased/markdown/11597.md | 64 +++++++++----------------- 1 file changed, 23 insertions(+), 41 deletions(-) diff --git a/changelog_unreleased/markdown/11597.md b/changelog_unreleased/markdown/11597.md index 3fa8c1304d86..009e4df88f14 100644 --- a/changelog_unreleased/markdown/11597.md +++ b/changelog_unreleased/markdown/11597.md @@ -1,15 +1,16 @@ -#### No more inserting space between Chinese or Japanese (e.g. hanzi and kana) and western characters (#11597 by @tats-u) +#### [HIGHLIGHT] Improve handling of whitespace for Chinese, Japanese, and Korean (#11597 by @tats-u) -The current behavior of inserting whitespace (U+0020) between Chinese or Japanese (e.g. hanzi/kanji and kana) and western (e.g. alphanumerics) characters is not based on the official layout guidelines in Japanese and Chinese but [non-standard and local one in Chinese](https://github.com/ruanyf/document-style-guide/blob/master/docs/text.md). +##### Stop inserting spaces between Chinese or Japanese and Western characters -It is not the only or the official rule, so Prettier now does not force it. If the document does not apply the rule, Prettier does follow the policy. +Previously, Prettier would insert spaces between Chinese or Japanese and Western characters (letters and digits). While some people prefer this style, it isn’t standard, and is in fact contrary to official guidelines. Please see [here](https://github.com/tats-u/prettier-plugin-md-nocjsp#why-this-plugin-is-needed) for more details. We decided it’s not Prettier’s job to enforce a particular style in this case, so spaces aren’t inserted anymore, while existing ones are preserved. -For more information of the reason, please see , which this patch is based on. +The tricky part of this change were ambiguous line breaks between Chinese or Japanese and Western characters. When Prettier unwraps text, it needs to decide whether such a line break should be simply removed or replaced with a space. For that Prettier examines the surrounding text and infers the preferred style. ```markdown -漢字Alphabetsひらがな12345カタカナ67890한글 +漢字 +Alphabetsひらがな12345カタカナ67890한글 漢字 Alphabets ひらがな 12345 カタカナ 67890 한글 @@ -24,14 +25,9 @@ For more information of the reason, please see -- Chinese: +There are rules that prohibit certain characters from appearing at the beginning or the end of a line in [Chinese](https://www.w3.org/TR/clreq/#prohibition_rules_for_line_start_end) and [Japanese](https://www.w3.org/TR/jlreq/#characters_not_starting_a_line). E.g., full stop characters `。`, `.`, and `.` shouldn’t start a line. 👎Bad: @@ -57,50 +53,36 @@ In Chinese and Japanese, some characters (e.g. full stop characters `。`, `.` In the above sentence: -- `、`, `)`, `ー`, `ょ`, and `。` must not be placed in the head of lines. -- `(` must not be placed in the tail of lines. +- `、`, `)`, `ー`, `ょ`, and `。` must not be placed at the beginning of a line. +- `(` must not be placed at the end of a line. -Prettier now follows the rule when `proseWrap` is other than `preserve`. +Prettier now follows these rules when `proseWrap` is other than `preserve`. -##### 2. Do not break lines inside Korean words +##### Do not break lines inside Korean words -This fixes #6516. Korean uses space to divide words or clauses, and inappropriate words (or clauses) divisions may change the meanings of sentences. +Korean uses spaces to divide words, and an inappropriate division may change the meaning of a sentence: -- `노래를 못해요.`: I'm not good at singing. -- `노래를 못 해요.`: I can't sing (for some reason). +- `노래를 못해요.`: I’m not good at singing. +- `노래를 못 해요.`: I can’t sing (for some reason). -Applying this patch, when `proseWrap` is other than `preserve`, Korean sentences are now formatted like those of European languages; successive (i.e. not divided by space) Hangul characters are no longer divided by the newline (U+000A), which may be converted to the space (U+0020) when the content of the document is modified. +Previously, when `proseWrap` was set to `always`, successive Hangul characters could get split by a line break, which could later be converted to a space when the document is edited and reformatted. This doesn’t happen anymore. Korean text is now wrapped like English. -`proseWrap` = `always` & `printWidth` = `9`: - -Input: - -```text + +```markdown + 노래를 못해요. -``` - -Before this patch: -```text + 노래를 못 해요. -``` - -👎Reformat “Before this patch” with `printWidth` = `80`: -```text + 노래를 못 해요. -``` -After this patch: - -```text + 노래를 못해요. -``` -👍Reformat “After this patch” with `printWidth` = `80`: - -```text + 노래를 못해요. ``` From b88858cb335d2995a5d73a43d059e0b9d26933ad Mon Sep 17 00:00:00 2001 From: Georgii Dolzhykov Date: Sun, 9 Oct 2022 21:15:20 +0300 Subject: [PATCH 54/87] Reformat --- src/language-markdown/printer-markdown.js | 7 ++++++- src/language-markdown/whitespace.js | 6 +----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/language-markdown/printer-markdown.js b/src/language-markdown/printer-markdown.js index 3ea5cf017bb4..bfe0f3dd3d78 100644 --- a/src/language-markdown/printer-markdown.js +++ b/src/language-markdown/printer-markdown.js @@ -144,7 +144,12 @@ function genericPrint(path, options, print) { ? "never" : options.proseWrap; - return printWhitespace(path, node.value, { proseWrap }, { previous, next }); + return printWhitespace( + path, + node.value, + { proseWrap }, + { previous, next } + ); } case "emphasis": { let style; diff --git a/src/language-markdown/whitespace.js b/src/language-markdown/whitespace.js index f7657e1b2cc8..d9affa86f00f 100644 --- a/src/language-markdown/whitespace.js +++ b/src/language-markdown/whitespace.js @@ -290,8 +290,4 @@ function printWhitespace(path, value, options, adjacentNodes) { } } -export { - canBeConvertedToSpace, - isBreakable, - printWhitespace, -}; +export { printWhitespace }; From 2f3e8590b5b2b86e8e715ece0c0a10035b256e09 Mon Sep 17 00:00:00 2001 From: Georgii Dolzhykov Date: Sun, 9 Oct 2022 21:16:03 +0300 Subject: [PATCH 55/87] Rename whitespace.js to print-whitespace.js --- src/language-markdown/{whitespace.js => print-whitespace.js} | 0 src/language-markdown/printer-markdown.js | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename src/language-markdown/{whitespace.js => print-whitespace.js} (100%) diff --git a/src/language-markdown/whitespace.js b/src/language-markdown/print-whitespace.js similarity index 100% rename from src/language-markdown/whitespace.js rename to src/language-markdown/print-whitespace.js diff --git a/src/language-markdown/printer-markdown.js b/src/language-markdown/printer-markdown.js index bfe0f3dd3d78..cb3ca9c85483 100644 --- a/src/language-markdown/printer-markdown.js +++ b/src/language-markdown/printer-markdown.js @@ -38,7 +38,7 @@ import { isAutolink, } from "./utils.js"; import visitorKeys from "./visitor-keys.js"; -import { printWhitespace } from "./whitespace.js"; +import { printWhitespace } from "./print-whitespace.js"; const getVisitorKeys = createGetVisitorKeys(visitorKeys); From 9f11bd196081abf238fc33247fe398f23c1e60f9 Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Mon, 10 Oct 2022 16:19:54 +0900 Subject: [PATCH 56/87] Improve changelog --- changelog_unreleased/markdown/11597.md | 64 ++++++++++++++++++-------- cspell.json | 1 + 2 files changed, 46 insertions(+), 19 deletions(-) diff --git a/changelog_unreleased/markdown/11597.md b/changelog_unreleased/markdown/11597.md index 009e4df88f14..0e5dae5f1c3d 100644 --- a/changelog_unreleased/markdown/11597.md +++ b/changelog_unreleased/markdown/11597.md @@ -29,34 +29,54 @@ Alphabetsひらがな12345カタカナ67890한글 There are rules that prohibit certain characters from appearing at the beginning or the end of a line in [Chinese](https://www.w3.org/TR/clreq/#prohibition_rules_for_line_start_end) and [Japanese](https://www.w3.org/TR/jlreq/#characters_not_starting_a_line). E.g., full stop characters `。`, `.`, and `.` shouldn’t start a line. -👎Bad: - -```text -日本語組版には -、禁則処理( -きんそくしょり -)と呼ばれるル -ールがあります + +```markdown + +HTCPCPのエラー418は、ティーポットにコーヒーを淹(い)れさせようとしたときに返されるステータスコードだ。 + + + + +HTCPCP の +エラー +418 は、 +ティーポ +ットにコ +ーヒーを +淹(い) +れさせよ +うとした +ときに返 +されるス +テータス +コードだ 。 -``` -👍Good: -```text -日本語組版に -は、禁則処理 -(きんそくしょ -り)と呼ばれる -ルールがありま -す。 + +HTCPCPの +エラー +418は、 +ティー +ポットに +コーヒー +を淹 +(い)れ +させよう +としたと +きに返さ +れるス +テータス +コード +だ。 ``` In the above sentence: -- `、`, `)`, `ー`, `ょ`, and `。` must not be placed at the beginning of a line. +- `ィ`, `ッ`, `)`, `ー`, `、`, and `。` must not be placed at the beginning of a line. - `(` must not be placed at the end of a line. -Prettier now follows these rules when `proseWrap` is other than `preserve`. +Prettier now follows these rules when `proseWrap` is `always` unless the original documents break the rules on purpose. If they break, Prettier respects the policy (doesn’t force the rules). If you have documents not conforming to the rules due to old versions of Prettier, you want to fix all violation places by hand and reformat them using the latest version. ##### Do not break lines inside Korean words @@ -86,3 +106,9 @@ Previously, when `proseWrap` was set to `always`, successive Hangul characters c 노래를 못해요. ``` + +By this change, line break between hangul and letters in other languages will produce a space if two lines around it are merged into a single line. For example: + +> 3분 기다려 주지. + +In this sentence, you shouldn’t break lines between “3” and “분”, or a space will be inserted there when they’re merged into a single line again. diff --git a/cspell.json b/cspell.json index 51ba6ea13f07..8d8bee0c64d8 100644 --- a/cspell.json +++ b/cspell.json @@ -132,6 +132,7 @@ "hljs", "hlsson", "Horky", + "htcpcp", "htmlblock", "htmlhint", "hugomrdias", From ae587bd005c92318f35e8bbdb0abdc86f30c9119 Mon Sep 17 00:00:00 2001 From: Georgii Dolzhykov Date: Mon, 10 Oct 2022 16:43:40 +0300 Subject: [PATCH 57/87] Edit changelog entry --- changelog_unreleased/markdown/11597.md | 29 ++++++++++++-------------- cspell.json | 2 +- 2 files changed, 14 insertions(+), 17 deletions(-) diff --git a/changelog_unreleased/markdown/11597.md b/changelog_unreleased/markdown/11597.md index 0e5dae5f1c3d..608fb0174c3c 100644 --- a/changelog_unreleased/markdown/11597.md +++ b/changelog_unreleased/markdown/11597.md @@ -2,7 +2,7 @@ ##### Stop inserting spaces between Chinese or Japanese and Western characters -Previously, Prettier would insert spaces between Chinese or Japanese and Western characters (letters and digits). While some people prefer this style, it isn’t standard, and is in fact contrary to official guidelines. Please see [here](https://github.com/tats-u/prettier-plugin-md-nocjsp#why-this-plugin-is-needed) for more details. We decided it’s not Prettier’s job to enforce a particular style in this case, so spaces aren’t inserted anymore, while existing ones are preserved. +Previously, Prettier would insert spaces between Chinese or Japanese and Western characters (letters and digits). While some people prefer this style, it isn’t standard, and is in fact contrary to official guidelines. Please see [here](https://github.com/tats-u/prettier-plugin-md-nocjsp#why-this-plugin-is-needed) for more details. We decided it’s not Prettier’s job to enforce a particular style in this case, so spaces aren’t inserted anymore, while existing ones are preserved. If you need a tool for enforcing spacing style, consider (textlint-ja)[https://github.com/textlint-ja/textlint-rule-preset-ja-spacing/tree/master/packages/textlint-rule-ja-space-between-half-and-full-width]. The tricky part of this change were ambiguous line breaks between Chinese or Japanese and Western characters. When Prettier unwraps text, it needs to decide whether such a line break should be simply removed or replaced with a space. For that Prettier examines the surrounding text and infers the preferred style. @@ -10,19 +10,19 @@ The tricky part of this change were ambiguous line breaks between Chinese or Jap ```markdown 漢字 -Alphabetsひらがな12345カタカナ67890한글 +Alphabetsひらがな12345カタカナ67890 -漢字 Alphabets ひらがな 12345 カタカナ 67890 한글 +漢字 Alphabets ひらがな 12345 カタカナ 67890 -漢字 Alphabets ひらがな 12345 カタカナ 67890한글 +漢字 Alphabets ひらがな 12345 カタカナ 67890 -漢字 Alphabets ひらがな 12345 カタカナ 67890 한글 +漢字 Alphabets ひらがな 12345 カタカナ 67890 -漢字Alphabetsひらがな12345カタカナ67890한글 +漢字Alphabetsひらがな12345カタカナ67890 -漢字 Alphabets ひらがな 12345 カタカナ 67890 한글 +漢字 Alphabets ひらがな 12345 カタカナ 67890 ``` ##### Comply to line breaking rules in Chinese and Japanese @@ -34,9 +34,7 @@ There are rules that prohibit certain characters from appearing at the beginning HTCPCPのエラー418は、ティーポットにコーヒーを淹(い)れさせようとしたときに返されるステータスコードだ。 - - - + HTCPCP の エラー 418 は、 @@ -52,8 +50,7 @@ HTCPCP の コードだ 。 - - + HTCPCPの エラー 418は、 @@ -71,12 +68,12 @@ HTCPCPの だ。 ``` -In the above sentence: +In the example above: - `ィ`, `ッ`, `)`, `ー`, `、`, and `。` must not be placed at the beginning of a line. - `(` must not be placed at the end of a line. -Prettier now follows these rules when `proseWrap` is `always` unless the original documents break the rules on purpose. If they break, Prettier respects the policy (doesn’t force the rules). If you have documents not conforming to the rules due to old versions of Prettier, you want to fix all violation places by hand and reformat them using the latest version. +Prettier now follows these rules when it wraps text, that is when `proseWrap` is set to `always`. However, if the input text violates these rules, Prettier doesn’t try to fix that and considers the whitespace intentional. In particular, that might happen if the text was formatted with an older version of Prettier, in which case you might need to correct those issues by hand or with another tool. ##### Do not break lines inside Korean words @@ -107,8 +104,8 @@ Previously, when `proseWrap` was set to `always`, successive Hangul characters c 노래를 못해요. ``` -By this change, line break between hangul and letters in other languages will produce a space if two lines around it are merged into a single line. For example: +A line break between Hangul and non-Hangul letters and digits is converted to a space when Prettier unwraps the text. Consider this example: > 3분 기다려 주지. -In this sentence, you shouldn’t break lines between “3” and “분”, or a space will be inserted there when they’re merged into a single line again. +In this sentence, if you break the line between “3” and “분”, a space will be inserted there when the text gets unwrapped. diff --git a/cspell.json b/cspell.json index 8d8bee0c64d8..740a76de746e 100644 --- a/cspell.json +++ b/cspell.json @@ -132,7 +132,6 @@ "hljs", "hlsson", "Horky", - "htcpcp", "htmlblock", "htmlhint", "hugomrdias", @@ -320,6 +319,7 @@ "templating", "tempy", "testname", + "textlint", "tldr", "Tomasek", "toplevel", From def9211011b5423e957ef57d9c945bff7afea688 Mon Sep 17 00:00:00 2001 From: Georgii Dolzhykov Date: Mon, 10 Oct 2022 17:03:07 +0300 Subject: [PATCH 58/87] Update cspell.json --- cspell.json | 1 - 1 file changed, 1 deletion(-) diff --git a/cspell.json b/cspell.json index 740a76de746e..9a17f438e1ce 100644 --- a/cspell.json +++ b/cspell.json @@ -122,7 +122,6 @@ "gofmt", "Gregor", "Hampus", - "hanzi", "hardline", "hardlines", "hashbang", From e3b98d5e9581dd665d5a80f00cc2774d38f5d357 Mon Sep 17 00:00:00 2001 From: Georgii Dolzhykov Date: Mon, 10 Oct 2022 17:08:49 +0300 Subject: [PATCH 59/87] Rename var, format --- src/language-markdown/print-whitespace.js | 40 ++++++++++++++++------- 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/src/language-markdown/print-whitespace.js b/src/language-markdown/print-whitespace.js index d9affa86f00f..c275bcebd517 100644 --- a/src/language-markdown/print-whitespace.js +++ b/src/language-markdown/print-whitespace.js @@ -22,6 +22,7 @@ const SINGLE_LINE_NODE_TYPES = ["heading", "tableCell", "link", "wikiLink"]; const noBreakAfter = new Set( "$(£¥·'\"〈《「『【〔〖〝﹙﹛$([{£¥[{‵︴︵︷︹︻︽︿﹁﹃﹏〘⦅«" ); + /** * The set of characters that must not immediately follow a line break * @@ -33,6 +34,7 @@ const noBreakAfter = new Set( const noBreakBefore = new Set( "!%),.:;?]}¢°·'\"†‡›℃∶、。〃〆〕〗〞﹚﹜!"%'),.:;?]}~–—•〉》」︰︱︲︳﹐﹑﹒﹓﹔﹕﹖﹘︶︸︺︼︾﹀﹂﹗|、』】〙〟⦆»ヽヾーァィゥェォッャュョヮヵヶぁぃぅぇぉっゃゅょゎゕゖㇰㇱㇲㇳㇴㇵㇶㇷㇸㇹㇺㇻㇼㇽㇾㇿ々〻‐゠〜~‼⁇⁈⁉・" ); + /** * The set of characters whose surrounding newline may be converted to Space * @@ -82,7 +84,6 @@ function isInSentenceWithCJSpaces(path) { return sentenceNode.usesCJSpaces; } -// /** * @typedef {import("./utils.js").WordNode} WordNode * @typedef {import("./utils.js").WhitespaceValue} WhitespaceValue @@ -111,16 +112,20 @@ function canBeConvertedToSpace(path, value, adjacentNodes) { if (value !== "\n") { return true; } + // no adjacent nodes if (!adjacentNodes) { return true; } + // e.g. " \nletter" if (!adjacentNodes.previous || !adjacentNodes.next) { return true; } + const previousKind = adjacentNodes.previous.kind; const nextKind = adjacentNodes.next.kind; + // "\n" between not western or Korean (han, kana, CJK punctuations) characters always can converted to Space // Korean hangul simulates latin words; see #6516 (https://github.com/prettier/prettier/issues/6516) if ( @@ -129,6 +134,7 @@ function canBeConvertedToSpace(path, value, adjacentNodes) { ) { return true; } + // han & hangul: same way preferred if ( (previousKind === KIND_K_LETTER && nextKind === KIND_CJ_LETTER) || @@ -136,6 +142,7 @@ function canBeConvertedToSpace(path, value, adjacentNodes) { ) { return true; } + // Do not convert it to Space when: if ( // Shall not be converted to Space around CJK punctuation @@ -147,6 +154,7 @@ function canBeConvertedToSpace(path, value, adjacentNodes) { ) { return false; } + const characterBefore = adjacentNodes.previous.value.at(-1); const characterAfter = adjacentNodes.next.value[0]; @@ -162,6 +170,7 @@ function canBeConvertedToSpace(path, value, adjacentNodes) { ) { return true; } + // Converting newline between CJ and non-ASCII punctuation to Space does not seem to be better in many cases. (PR welcome) // e.g. // 1. “ア〜エの中から1つ選べ。” @@ -174,6 +183,7 @@ function canBeConvertedToSpace(path, value, adjacentNodes) { ) { return false; } + // If the sentence uses the style that Space is injected in between CJ and alphanumerics, "\n" can be converted to Space. return isInSentenceWithCJSpaces(path); } @@ -218,19 +228,21 @@ function isWesternOrKoreanLetter(kind) { * @returns {boolean} `true` if “whitespace” can be converted to `"\n"` */ function isBreakable(path, value, options, adjacentNodes) { - if (options.proseWrap !== "always") { - return false; - } - if (getAncestorNode(path, SINGLE_LINE_NODE_TYPES)) { + if ( + options.proseWrap !== "always" || + getAncestorNode(path, SINGLE_LINE_NODE_TYPES) + ) { return false; } - if (adjacentNodes === undefined) { - return true; - } - // Space & newline are always breakable - if (value !== "") { + + if ( + adjacentNodes === undefined || + // Space & newline are always breakable + value !== "" + ) { return true; } + // Simulates latin words; see #6516 (https://github.com/prettier/prettier/issues/6516) // [Latin][""][Hangul] & vice versa => Don't break // [han & kana][""][hangul], either @@ -242,16 +254,19 @@ function isBreakable(path, value, options, adjacentNodes) { ) { return false; } + // https://en.wikipedia.org/wiki/Line_breaking_rules_in_East_Asian_languages - const isBreakingCJKLineBreakingRule = + const violatesCJKLineBreakingRule = (adjacentNodes.next?.value !== undefined && noBreakBefore.has(adjacentNodes.next?.value?.[0])) || (adjacentNodes.previous?.value !== undefined && noBreakAfter.has(adjacentNodes.previous?.value?.at(-1))); + // For "" (CJK and some non-space) higher priority than the following rule - if (isBreakingCJKLineBreakingRule) { + if (violatesCJKLineBreakingRule) { return false; } + return true; } @@ -272,6 +287,7 @@ function printWhitespace(path, value, options, adjacentNodes) { if (value !== "\n") { return convertToLineIfBreakable(value); } + // Chinese and Japanese does not use U+0020 Space to divide words, so U+000A End of Line must not be replaced with it. // Behavior in other languages will not be changed because there are too much things to consider. (PR welcome) // e.g. Word segmentation in Thai etc. From 0d084aca397c8cf2ea8a45dbd242d05b762a7d17 Mon Sep 17 00:00:00 2001 From: Georgii Dolzhykov Date: Mon, 10 Oct 2022 17:59:59 +0300 Subject: [PATCH 60/87] Refactor: sentence is always the parent of word, move functions to utils --- src/language-markdown/print-whitespace.js | 6 +++--- src/language-markdown/printer-markdown.js | 24 ++--------------------- src/language-markdown/utils.js | 22 +++++++++++++++++++++ 3 files changed, 27 insertions(+), 25 deletions(-) diff --git a/src/language-markdown/print-whitespace.js b/src/language-markdown/print-whitespace.js index c275bcebd517..7f348020af87 100644 --- a/src/language-markdown/print-whitespace.js +++ b/src/language-markdown/print-whitespace.js @@ -1,10 +1,10 @@ import { hardline, line, softline } from "../document/builders.js"; -import { getAncestorNode } from "./printer-markdown.js"; import { KIND_CJK_PUNCTUATION, KIND_CJ_LETTER, KIND_K_LETTER, KIND_NON_CJK, + getAncestorNode, punctuationRegex, } from "./utils.js"; @@ -49,11 +49,11 @@ const lineBreakBetweenTheseAndCJKConvertsToSpace = new Set( * (including ideograph aka han or kanji e.g. `字`, hiragana e.g. `あ`, and katakana e.g. `ア`) * and other letters (including alphanumerics; e.g. `A` or `1`) in the sentence. * - * @param {*} path current position in nodes tree + * @param {AstPath} path current position in nodes tree * @returns {boolean} `true` if Space is tend to be inserted between these types of letters, `false` otherwise. */ function isInSentenceWithCJSpaces(path) { - const sentenceNode = getAncestorNode(path, "sentence"); + const sentenceNode = path.parent; if (sentenceNode.usesCJSpaces !== undefined) { return sentenceNode.usesCJSpaces; } diff --git a/src/language-markdown/printer-markdown.js b/src/language-markdown/printer-markdown.js index cb3ca9c85483..dc0819b101a6 100644 --- a/src/language-markdown/printer-markdown.js +++ b/src/language-markdown/printer-markdown.js @@ -36,6 +36,8 @@ import { INLINE_NODE_TYPES, INLINE_NODE_WRAPPER_TYPES, isAutolink, + getAncestorNode, + getAncestorCounter, } from "./utils.js"; import visitorKeys from "./visitor-keys.js"; import { printWhitespace } from "./print-whitespace.js"; @@ -510,26 +512,6 @@ function getNthSiblingIndex(node, parentNode, condition) { } } -function getAncestorCounter(path, typeOrTypes) { - const types = Array.isArray(typeOrTypes) ? typeOrTypes : [typeOrTypes]; - - let counter = -1; - let ancestorNode; - - while ((ancestorNode = path.getParentNode(++counter))) { - if (types.includes(ancestorNode.type)) { - return counter; - } - } - - return -1; -} - -function getAncestorNode(path, typeOrTypes) { - const counter = getAncestorCounter(path, typeOrTypes); - return counter === -1 ? null : path.getParentNode(counter); -} - function printTable(path, options, print) { const { node } = path; @@ -893,8 +875,6 @@ function hasPrettierIgnore(path) { return path.index > 0 && isPrettierIgnore(path.previous) === "next"; } -export { getAncestorNode }; - const printer = { preprocess, print: genericPrint, diff --git a/src/language-markdown/utils.js b/src/language-markdown/utils.js index 5ba67a137063..a7928edcdf89 100644 --- a/src/language-markdown/utils.js +++ b/src/language-markdown/utils.js @@ -245,6 +245,26 @@ function isAutolink(node) { return locStart(node) === locStart(child) && locEnd(node) === locEnd(child); } +function getAncestorCounter(path, typeOrTypes) { + const types = Array.isArray(typeOrTypes) ? typeOrTypes : [typeOrTypes]; + + let counter = -1; + let ancestorNode; + + while ((ancestorNode = path.getParentNode(++counter))) { + if (types.includes(ancestorNode.type)) { + return counter; + } + } + + return -1; +} + +function getAncestorNode(path, typeOrTypes) { + const counter = getAncestorCounter(path, typeOrTypes); + return counter === -1 ? null : path.getParentNode(counter); +} + export { mapAst, splitText, @@ -260,4 +280,6 @@ export { KIND_CJ_LETTER, KIND_K_LETTER, KIND_CJK_PUNCTUATION, + getAncestorCounter, + getAncestorNode, }; From 124677a66be129aae20e6b5c4717406d12e01ff1 Mon Sep 17 00:00:00 2001 From: Georgii Dolzhykov Date: Tue, 11 Oct 2022 12:19:16 +0300 Subject: [PATCH 61/87] Edit changelog entry --- changelog_unreleased/markdown/11597.md | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/changelog_unreleased/markdown/11597.md b/changelog_unreleased/markdown/11597.md index 608fb0174c3c..58448064c476 100644 --- a/changelog_unreleased/markdown/11597.md +++ b/changelog_unreleased/markdown/11597.md @@ -2,7 +2,7 @@ ##### Stop inserting spaces between Chinese or Japanese and Western characters -Previously, Prettier would insert spaces between Chinese or Japanese and Western characters (letters and digits). While some people prefer this style, it isn’t standard, and is in fact contrary to official guidelines. Please see [here](https://github.com/tats-u/prettier-plugin-md-nocjsp#why-this-plugin-is-needed) for more details. We decided it’s not Prettier’s job to enforce a particular style in this case, so spaces aren’t inserted anymore, while existing ones are preserved. If you need a tool for enforcing spacing style, consider (textlint-ja)[https://github.com/textlint-ja/textlint-rule-preset-ja-spacing/tree/master/packages/textlint-rule-ja-space-between-half-and-full-width]. +Previously, Prettier would insert spaces between Chinese or Japanese and Western characters (letters and digits). While some people prefer this style, it isn’t standard, and is in fact contrary to official guidelines. Please see [here](https://github.com/tats-u/prettier-plugin-md-nocjsp#why-this-plugin-is-needed) for more details. We decided it’s not Prettier’s job to enforce a particular style in this case, so spaces aren’t inserted anymore, while existing ones are preserved. If you need a tool for enforcing spacing style, consider [textlint-ja](https://github.com/textlint-ja/textlint-rule-preset-ja-spacing/tree/master/packages/textlint-rule-ja-space-between-half-and-full-width) or [lint-md](https://github.com/lint-md/lint-md) (rules `space-round-alphabet` and `space-round-number`). The tricky part of this change were ambiguous line breaks between Chinese or Japanese and Western characters. When Prettier unwraps text, it needs to decide whether such a line break should be simply removed or replaced with a space. For that Prettier examines the surrounding text and infers the preferred style. @@ -27,7 +27,7 @@ Alphabetsひらがな12345カタカナ67890 ##### Comply to line breaking rules in Chinese and Japanese -There are rules that prohibit certain characters from appearing at the beginning or the end of a line in [Chinese](https://www.w3.org/TR/clreq/#prohibition_rules_for_line_start_end) and [Japanese](https://www.w3.org/TR/jlreq/#characters_not_starting_a_line). E.g., full stop characters `。`, `.`, and `.` shouldn’t start a line. +There are rules that prohibit certain characters from appearing at the beginning or the end of a line in [Chinese](https://www.w3.org/TR/clreq/#prohibition_rules_for_line_start_end) and [Japanese](https://www.w3.org/TR/jlreq/#characters_not_starting_a_line). E.g., full stop characters `。`, `.`, and `.` shouldn’t start a line whereas `(` shouldn’t end a line. Prettier now follows these rules when it wraps text, that is when `proseWrap` is set to `always`. ```markdown @@ -68,13 +68,6 @@ HTCPCPの だ。 ``` -In the example above: - -- `ィ`, `ッ`, `)`, `ー`, `、`, and `。` must not be placed at the beginning of a line. -- `(` must not be placed at the end of a line. - -Prettier now follows these rules when it wraps text, that is when `proseWrap` is set to `always`. However, if the input text violates these rules, Prettier doesn’t try to fix that and considers the whitespace intentional. In particular, that might happen if the text was formatted with an older version of Prettier, in which case you might need to correct those issues by hand or with another tool. - ##### Do not break lines inside Korean words Korean uses spaces to divide words, and an inappropriate division may change the meaning of a sentence: From 1d14dad0e6c15180b9964085bcb85d4d659f22a1 Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Tue, 11 Oct 2022 23:59:16 +0900 Subject: [PATCH 62/87] Make sure to correct intentional violation of line breaking rules --- src/language-markdown/print-whitespace.js | 12 ++++++++---- .../splitCjkText/__snapshots__/jsfmt.spec.js.snap | 6 ++++++ .../markdown/splitCjkText/symbolSpaceNewLine.md | 3 +++ 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/src/language-markdown/print-whitespace.js b/src/language-markdown/print-whitespace.js index 7f348020af87..595906f0da6d 100644 --- a/src/language-markdown/print-whitespace.js +++ b/src/language-markdown/print-whitespace.js @@ -237,8 +237,8 @@ function isBreakable(path, value, options, adjacentNodes) { if ( adjacentNodes === undefined || - // Space & newline are always breakable - value !== "" + // Space are always breakable + value === " " ) { return true; } @@ -247,7 +247,8 @@ function isBreakable(path, value, options, adjacentNodes) { // [Latin][""][Hangul] & vice versa => Don't break // [han & kana][""][hangul], either if ( - (adjacentNodes.previous?.kind === KIND_K_LETTER && + (value === "" && + adjacentNodes.previous?.kind === KIND_K_LETTER && isLetter(adjacentNodes.next?.kind)) || (adjacentNodes.next?.kind === KIND_K_LETTER && isLetter(adjacentNodes.previous?.kind)) @@ -262,7 +263,10 @@ function isBreakable(path, value, options, adjacentNodes) { (adjacentNodes.previous?.value !== undefined && noBreakAfter.has(adjacentNodes.previous?.value?.at(-1))); - // For "" (CJK and some non-space) higher priority than the following rule + // Targets not only "" but also "\n" + // Intentional violation of the line breaking rules (e.g. “ル\nール守れ\n!”) tends to be “corrected” ("\n" -> "") by formatted with a large value of `printWidth`. + // Eventually, by reformatted with a smaller value of `printWidth` or because of a paragraph revision, the rules are going to be applied to the place that used to violate them. + // e.g. “シ\nョ\nートカ\nット\n!\n?” (completely violates the rules on purpose) --[printWidth = 6]-->“ショー\nトカッ\nト!?” --[s/^/こんなところで/]--> “こんな\nところ\nで\nショー\nトカッ\nト!?” (completely complies to the rules) if (violatesCJKLineBreakingRule) { return false; } diff --git a/tests/format/markdown/splitCjkText/__snapshots__/jsfmt.spec.js.snap b/tests/format/markdown/splitCjkText/__snapshots__/jsfmt.spec.js.snap index e3a21ef5b902..5ee3f96cd18d 100644 --- a/tests/format/markdown/splitCjkText/__snapshots__/jsfmt.spec.js.snap +++ b/tests/format/markdown/splitCjkText/__snapshots__/jsfmt.spec.js.snap @@ -319,6 +319,9 @@ English ・ 中点 +禁則処理にわざと違反した文章のテストを今から行います。準備はいいでしょうか?レデ +ィ、ゴー! + =====================================output===================================== 日本語、にほんご。汉语, 中文. 日本語,にほんご.English words!? 漢字!汉字?「セ リフ」(括弧) 文字(括弧)文字【括弧】日本語English @@ -334,5 +337,8 @@ English 中点・中点 +禁則処理にわざと違反した文章のテストを今から行います。準備はいいでしょうか?レ +ディ、ゴー! + ================================================================================ `; diff --git a/tests/format/markdown/splitCjkText/symbolSpaceNewLine.md b/tests/format/markdown/splitCjkText/symbolSpaceNewLine.md index f6e277e7cf82..1b73a374628a 100644 --- a/tests/format/markdown/splitCjkText/symbolSpaceNewLine.md +++ b/tests/format/markdown/splitCjkText/symbolSpaceNewLine.md @@ -48,3 +48,6 @@ English 中点 ・ 中点 + +禁則処理にわざと違反した文章のテストを今から行います。準備はいいでしょうか?レデ +ィ、ゴー! From 7fc7e911f604a67731c68e03ce4e83351b8c43c4 Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Fri, 14 Oct 2022 00:20:31 +0900 Subject: [PATCH 63/87] Mitigate join to divide loop --- src/language-markdown/print-whitespace.js | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/language-markdown/print-whitespace.js b/src/language-markdown/print-whitespace.js index 595906f0da6d..b4bb219f4318 100644 --- a/src/language-markdown/print-whitespace.js +++ b/src/language-markdown/print-whitespace.js @@ -256,6 +256,11 @@ function isBreakable(path, value, options, adjacentNodes) { return false; } + // If `false`, joined with `" "` -> divided into 2 lines by `" "` loop occurs + if (value === "\n" && canBeConvertedToSpace(path, value, adjacentNodes)) { + return true; + } + // https://en.wikipedia.org/wiki/Line_breaking_rules_in_East_Asian_languages const violatesCJKLineBreakingRule = (adjacentNodes.next?.value !== undefined && From d0df9e36adc99e6600e16a0da4f422a90e27bb5c Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Fri, 14 Oct 2022 23:11:22 +0900 Subject: [PATCH 64/87] Fix unintended incorrect condition --- src/language-markdown/print-whitespace.js | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/language-markdown/print-whitespace.js b/src/language-markdown/print-whitespace.js index b4bb219f4318..96e2ff0f0b95 100644 --- a/src/language-markdown/print-whitespace.js +++ b/src/language-markdown/print-whitespace.js @@ -247,11 +247,11 @@ function isBreakable(path, value, options, adjacentNodes) { // [Latin][""][Hangul] & vice versa => Don't break // [han & kana][""][hangul], either if ( - (value === "" && - adjacentNodes.previous?.kind === KIND_K_LETTER && + value === "" && + ((adjacentNodes.previous?.kind === KIND_K_LETTER && isLetter(adjacentNodes.next?.kind)) || - (adjacentNodes.next?.kind === KIND_K_LETTER && - isLetter(adjacentNodes.previous?.kind)) + (adjacentNodes.next?.kind === KIND_K_LETTER && + isLetter(adjacentNodes.previous?.kind))) ) { return false; } From 27590d10da8cb19304b3b31814c39102539096b8 Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Fri, 14 Oct 2022 23:14:48 +0900 Subject: [PATCH 65/87] Use `has{Leading,Trailing}Punctuation` --- src/language-markdown/print-whitespace.js | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/language-markdown/print-whitespace.js b/src/language-markdown/print-whitespace.js index 96e2ff0f0b95..941b54553651 100644 --- a/src/language-markdown/print-whitespace.js +++ b/src/language-markdown/print-whitespace.js @@ -5,7 +5,6 @@ import { KIND_K_LETTER, KIND_NON_CJK, getAncestorNode, - punctuationRegex, } from "./utils.js"; const SINGLE_LINE_NODE_TYPES = ["heading", "tableCell", "link", "wikiLink"]; @@ -178,8 +177,8 @@ function canBeConvertedToSpace(path, value, adjacentNodes) { // 2. “これはひどい……なんと汚いコミットログなんだ……” // "…" (U+2026) belongs to Po, and "\n" in "これはひどい……\nなんと汚いコミットログなんだ……" must not, either. if ( - punctuationRegex.test(characterBefore) || - punctuationRegex.test(characterAfter) + adjacentNodes.previous.hasTrailingPunctuation || + adjacentNodes.next.hasLeadingPunctuation ) { return false; } From 70283e74e8b98c35f473ddc6fb18f7915bd06aa3 Mon Sep 17 00:00:00 2001 From: Georgii Dolzhykov Date: Fri, 14 Oct 2022 20:41:37 +0300 Subject: [PATCH 66/87] Minor refactoring --- src/language-markdown/print-whitespace.js | 6 +++--- src/language-markdown/utils.js | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/language-markdown/print-whitespace.js b/src/language-markdown/print-whitespace.js index 941b54553651..4d5c834eabe6 100644 --- a/src/language-markdown/print-whitespace.js +++ b/src/language-markdown/print-whitespace.js @@ -61,14 +61,14 @@ function isInSentenceWithCJSpaces(path) { " ": 0, "": 0, }; - for (let i = 0; i < sentenceNode.children.length; ++i) { + for (let i = 1; i < sentenceNode.children.length - 1; ++i) { const node = sentenceNode.children[i]; if ( node.type === "whitespace" && (node.value === " " || node.value === "") ) { - const previousKind = sentenceNode.children[i - 1]?.kind; - const nextKind = sentenceNode.children[i + 1]?.kind; + const previousKind = sentenceNode.children[i - 1].kind; + const nextKind = sentenceNode.children[i + 1].kind; if ( (previousKind === "cj-letter" && nextKind === "non-cjk") || (previousKind === "non-cjk" && nextKind === "cj-letter") diff --git a/src/language-markdown/utils.js b/src/language-markdown/utils.js index a7928edcdf89..32643f3a6e73 100644 --- a/src/language-markdown/utils.js +++ b/src/language-markdown/utils.js @@ -269,7 +269,6 @@ export { mapAst, splitText, punctuationPattern, - punctuationRegex, getFencedCodeBlockValue, getOrderedListItemInfo, hasGitDiffFriendlyOrderedList, From d8b75b1bfec8d5fa81ef4e6e948969310d5d2268 Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Sat, 15 Oct 2022 19:37:03 +0900 Subject: [PATCH 67/87] Fix footnote label formatting regression --- src/language-markdown/printer-markdown.js | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/language-markdown/printer-markdown.js b/src/language-markdown/printer-markdown.js index dc0819b101a6..b8e07e1fe206 100644 --- a/src/language-markdown/printer-markdown.js +++ b/src/language-markdown/printer-markdown.js @@ -64,12 +64,19 @@ function genericPrint(path, options, print) { node.position.start.offset, node.position.end.offset ) - ).map((node) => + ).map((node, i, nodes) => node.type === "word" ? node.value : node.value === "" ? "" - : printWhitespace(path, node.value, options) + : printWhitespace(path, node.value, options, { + // The nodes next to `WhitespaceNode` are guaranteed to be `WordNode` + // `prettier-ignore` is required due to https://github.com/prettier/prettier/issues/8171 + // prettier-ignore + previous: /** @type {import("./utils.js").WordNode} */ (nodes[i - 1]), + // prettier-ignore + next: /** @type {import("./utils.js").WordNode} */ (nodes[i + 1]), + }) ); } From 90e21442dc1955c3cd1add6830ece537ec2d4246 Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Sat, 15 Oct 2022 20:45:17 +0900 Subject: [PATCH 68/87] Add more test cases --- .../__snapshots__/jsfmt.spec.js.snap | 18 ++++++++++++++++++ tests/format/markdown/splitCjkText/korean.md | 2 ++ .../splitCjkText/symbolSpaceNewLine.md | 9 +++++++++ 3 files changed, 29 insertions(+) diff --git a/tests/format/markdown/splitCjkText/__snapshots__/jsfmt.spec.js.snap b/tests/format/markdown/splitCjkText/__snapshots__/jsfmt.spec.js.snap index 5ee3f96cd18d..13869fe3acf0 100644 --- a/tests/format/markdown/splitCjkText/__snapshots__/jsfmt.spec.js.snap +++ b/tests/format/markdown/splitCjkText/__snapshots__/jsfmt.spec.js.snap @@ -170,6 +170,8 @@ NVIDIA의 RTX3000 시리즈는 삼성전자와 TSMC에 의해 제조된다. 과 尹錫悅 과 음... +역대 대통령은 李承晩과 許政과 尹潽善과 朴正熙과 崔圭夏과 朴忠勳과 全斗煥과 盧泰愚과 金泳三과 金大中과 盧武鉉과 李明博과 朴槿惠과 文在寅과 尹錫悅과 음... 역대 대통령은 李承晩과 許政과 尹潽善과 朴正熙과 崔圭夏과 朴忠勳과 全斗煥과 盧泰愚과 金泳三과 金大中과 盧武鉉과 李明博과 朴槿惠과 文在寅과 尹錫悅과 음... + =====================================output===================================== 예문Latin예문Latin 예문Latin예문 Latin예문Latin 예문 Latin 예문 @@ -196,6 +198,11 @@ NVIDIA의 RTX3000 시리즈는 삼성전자와 TSMC에 의해 제조된다. 全斗煥 과 盧泰愚 과 金泳三 과 金大中 과 盧武鉉 과 李明博 과 朴槿惠 과 文在寅 과 尹錫悅 과 음... +역대 대통령은 李承晩과 許政과 尹潽善과 朴正熙과 崔圭夏과 朴忠勳과 全斗煥과 盧泰 +愚과 金泳三과 金大中과 盧武鉉과 李明博과 朴槿惠과 文在寅과 尹錫悅과 음... 역대 +대통령은 李承晩과 許政과 尹潽善과 朴正熙과 崔圭夏과 朴忠勳과 全斗煥과 盧泰愚과 +金泳三과 金大中과 盧武鉉과 李明博과 朴槿惠과 文在寅과 尹錫悅과 음... + ================================================================================ `; @@ -322,6 +329,15 @@ English 禁則処理にわざと違反した文章のテストを今から行います。準備はいいでしょうか?レデ ィ、ゴー! +[ウ +ィキペディアウ +ィキペディアウ +ィキペディアウ +ィキペディアウ +ィキペディアウ +ィキペディアウ +ィキペディア] + =====================================output===================================== 日本語、にほんご。汉语, 中文. 日本語,にほんご.English words!? 漢字!汉字?「セ リフ」(括弧) 文字(括弧)文字【括弧】日本語English @@ -340,5 +356,7 @@ English 禁則処理にわざと違反した文章のテストを今から行います。準備はいいでしょうか?レ ディ、ゴー! +[ウィキペディアウィキペディアウィキペディアウィキペディアウィキペディアウィキペディアウィキペディア] + ================================================================================ `; diff --git a/tests/format/markdown/splitCjkText/korean.md b/tests/format/markdown/splitCjkText/korean.md index 4713c34243f7..ba7608861d30 100644 --- a/tests/format/markdown/splitCjkText/korean.md +++ b/tests/format/markdown/splitCjkText/korean.md @@ -58,3 +58,5 @@ NVIDIA의 RTX3000 시리즈는 삼성전자와 TSMC에 의해 제조된다. 과 文在寅 과 尹錫悅 과 음... + +역대 대통령은 李承晩과 許政과 尹潽善과 朴正熙과 崔圭夏과 朴忠勳과 全斗煥과 盧泰愚과 金泳三과 金大中과 盧武鉉과 李明博과 朴槿惠과 文在寅과 尹錫悅과 음... 역대 대통령은 李承晩과 許政과 尹潽善과 朴正熙과 崔圭夏과 朴忠勳과 全斗煥과 盧泰愚과 金泳三과 金大中과 盧武鉉과 李明博과 朴槿惠과 文在寅과 尹錫悅과 음... diff --git a/tests/format/markdown/splitCjkText/symbolSpaceNewLine.md b/tests/format/markdown/splitCjkText/symbolSpaceNewLine.md index 1b73a374628a..7facace6616e 100644 --- a/tests/format/markdown/splitCjkText/symbolSpaceNewLine.md +++ b/tests/format/markdown/splitCjkText/symbolSpaceNewLine.md @@ -51,3 +51,12 @@ English 禁則処理にわざと違反した文章のテストを今から行います。準備はいいでしょうか?レデ ィ、ゴー! + +[ウ +ィキペディアウ +ィキペディアウ +ィキペディアウ +ィキペディアウ +ィキペディアウ +ィキペディアウ +ィキペディア] From d26de28b34cf7148c4e0f74e010ab6962ef8277c Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Sat, 15 Oct 2022 21:37:45 +0900 Subject: [PATCH 69/87] Call `canBeConvertedToSpace` at most once --- src/language-markdown/print-whitespace.js | 46 +++++++++++++---------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/src/language-markdown/print-whitespace.js b/src/language-markdown/print-whitespace.js index 4d5c834eabe6..925fd9745270 100644 --- a/src/language-markdown/print-whitespace.js +++ b/src/language-markdown/print-whitespace.js @@ -104,7 +104,7 @@ function isInSentenceWithCJSpaces(path) { * @param {AstPath} path path of given node * @param {WhitespaceValue} value value of given node (typically `" "` or `"\n"`) * @param {AdjacentNodes | undefined} adjacentNodes adjacent sibling nodes of given node - * @returns {boolean} `true` if given node can be converted to space, `false` if not (i.e. newline or empty character) + * @returns {boolean} `true` if given node can be convertedIoSpace, `false` if not (i.e. newline or empty character) */ function canBeConvertedToSpace(path, value, adjacentNodes) { // "\n" or " ", of course " " always can be converted to Space @@ -147,8 +147,8 @@ function canBeConvertedToSpace(path, value, adjacentNodes) { // Shall not be converted to Space around CJK punctuation previousKind === KIND_CJK_PUNCTUATION || nextKind === KIND_CJK_PUNCTUATION || - // "\n" between CJ always SHALL NOT be converted to space - // "\n" between Korean and CJ is better not to be converted to space + // "\n" between CJ always SHALL NOT be convertedIoSpace + // "\n" between Korean and CJ is better not to be convertedIoSpace (isCJK(previousKind) && isCJK(nextKind)) ) { return false; @@ -224,7 +224,7 @@ function isWesternOrKoreanLetter(kind) { * @param {WhitespaceValue} value * @param {*} options * @param {AdjacentNodes | undefined} [adjacentNodes] - * @returns {boolean} `true` if “whitespace” can be converted to `"\n"` + * @returns {boolean | "trueIfSpace"} `true` if “whitespace” can be converted to `"\n"`; `trueIfSpace` equals to true only if `canBeConvertedToSpace` retuns `true` */ function isBreakable(path, value, options, adjacentNodes) { if ( @@ -255,11 +255,6 @@ function isBreakable(path, value, options, adjacentNodes) { return false; } - // If `false`, joined with `" "` -> divided into 2 lines by `" "` loop occurs - if (value === "\n" && canBeConvertedToSpace(path, value, adjacentNodes)) { - return true; - } - // https://en.wikipedia.org/wiki/Line_breaking_rules_in_East_Asian_languages const violatesCJKLineBreakingRule = (adjacentNodes.next?.value !== undefined && @@ -272,7 +267,8 @@ function isBreakable(path, value, options, adjacentNodes) { // Eventually, by reformatted with a smaller value of `printWidth` or because of a paragraph revision, the rules are going to be applied to the place that used to violate them. // e.g. “シ\nョ\nートカ\nット\n!\n?” (completely violates the rules on purpose) --[printWidth = 6]-->“ショー\nトカッ\nト!?” --[s/^/こんなところで/]--> “こんな\nところ\nで\nショー\nトカッ\nト!?” (completely complies to the rules) if (violatesCJKLineBreakingRule) { - return false; + // If `false` even Space, joined with `" "` -> divided into 2 lines by `" "` loop occurs + return value === "\n" ? "trueIfSpace" : false; } return true; @@ -293,25 +289,35 @@ function printWhitespace(path, value, options, adjacentNodes) { // Space or empty if (value !== "\n") { - return convertToLineIfBreakable(value); + return convertToLineIfBreakable(value, isBreakable_); } // Chinese and Japanese does not use U+0020 Space to divide words, so U+000A End of Line must not be replaced with it. // Behavior in other languages will not be changed because there are too much things to consider. (PR welcome) // e.g. Word segmentation in Thai etc. return convertToLineIfBreakable( - canBeConvertedToSpace(path, value, adjacentNodes) ? " " : "" + canBeConvertedToSpace(path, value, adjacentNodes) ? " " : "", + isBreakable_ ); +} - /** - * @param value {" " | ""} - */ - function convertToLineIfBreakable(value) { - if (!isBreakable_) { - return value; - } - return value === " " ? line : softline; +/** + * @param {" " | ""} value + * @param {ReturnType} isBreakable_ + */ +function convertToLineIfBreakable(value, isBreakable_) { + if (!isBreakable_) { + return value; + } + if (value === " ") { + return line; + } + // value === "" + if (isBreakable_ === "trueIfSpace") { + return ""; } + // isBreakable === true + return softline; } export { printWhitespace }; From 3f72bb0dd83af024ce811d63ed212f3ff043c864 Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Sat, 15 Oct 2022 22:27:04 +0900 Subject: [PATCH 70/87] Remove unused argument from `canBeConvertedToSpace` --- src/language-markdown/print-whitespace.js | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/src/language-markdown/print-whitespace.js b/src/language-markdown/print-whitespace.js index 925fd9745270..9d7f9a71bb8f 100644 --- a/src/language-markdown/print-whitespace.js +++ b/src/language-markdown/print-whitespace.js @@ -93,7 +93,7 @@ function isInSentenceWithCJSpaces(path) { */ /** - * Checks if given node can be converted to Space + * Checks if given `"\n"` node can be converted to Space * * For example, if you would like to squash the multi-line string `"You might want\nto use Prettier."` into a single line, * you would replace `"\n"` with `" "`. (`"You might want to use Prettier."`) @@ -102,16 +102,10 @@ function isInSentenceWithCJSpaces(path) { * Behavior in other languages (e.g. Thai) will not be changed because there are too much things to consider. (PR welcome) * * @param {AstPath} path path of given node - * @param {WhitespaceValue} value value of given node (typically `" "` or `"\n"`) * @param {AdjacentNodes | undefined} adjacentNodes adjacent sibling nodes of given node - * @returns {boolean} `true` if given node can be convertedIoSpace, `false` if not (i.e. newline or empty character) + * @returns {boolean} `true` if given node can be convertedToSpace, `false` if not (i.e. newline or empty character) */ -function canBeConvertedToSpace(path, value, adjacentNodes) { - // "\n" or " ", of course " " always can be converted to Space - if (value !== "\n") { - return true; - } - +function canBeConvertedToSpace(path, adjacentNodes) { // no adjacent nodes if (!adjacentNodes) { return true; @@ -147,8 +141,8 @@ function canBeConvertedToSpace(path, value, adjacentNodes) { // Shall not be converted to Space around CJK punctuation previousKind === KIND_CJK_PUNCTUATION || nextKind === KIND_CJK_PUNCTUATION || - // "\n" between CJ always SHALL NOT be convertedIoSpace - // "\n" between Korean and CJ is better not to be convertedIoSpace + // "\n" between CJ always SHALL NOT be convertedToSpace + // "\n" between Korean and CJ is better not to be convertedToSpace (isCJK(previousKind) && isCJK(nextKind)) ) { return false; @@ -224,7 +218,7 @@ function isWesternOrKoreanLetter(kind) { * @param {WhitespaceValue} value * @param {*} options * @param {AdjacentNodes | undefined} [adjacentNodes] - * @returns {boolean | "trueIfSpace"} `true` if “whitespace” can be converted to `"\n"`; `trueIfSpace` equals to true only if `canBeConvertedToSpace` retuns `true` + * @returns {boolean | "trueIfSpace"} `true` if “whitespace” can be converted to `"\n"`; `trueIfSpace` equals to true only if `canBeConvertedToSpace` returns `true` */ function isBreakable(path, value, options, adjacentNodes) { if ( @@ -296,7 +290,7 @@ function printWhitespace(path, value, options, adjacentNodes) { // Behavior in other languages will not be changed because there are too much things to consider. (PR welcome) // e.g. Word segmentation in Thai etc. return convertToLineIfBreakable( - canBeConvertedToSpace(path, value, adjacentNodes) ? " " : "", + canBeConvertedToSpace(path, adjacentNodes) ? " " : "", isBreakable_ ); } From 5e4bf056cbf1f39e4af6a61fb26328a3b72a0096 Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Sun, 16 Oct 2022 14:27:20 +0900 Subject: [PATCH 71/87] Permit conversion from newline to space in `linkReference` --- src/language-markdown/printer-markdown.js | 16 +++++++--------- .../__snapshots__/jsfmt.spec.js.snap | 13 ++++++------- .../markdown/splitCjkText/symbolSpaceNewLine.md | 9 +++------ 3 files changed, 16 insertions(+), 22 deletions(-) diff --git a/src/language-markdown/printer-markdown.js b/src/language-markdown/printer-markdown.js index b8e07e1fe206..e3e1498fe54a 100644 --- a/src/language-markdown/printer-markdown.js +++ b/src/language-markdown/printer-markdown.js @@ -64,19 +64,17 @@ function genericPrint(path, options, print) { node.position.start.offset, node.position.end.offset ) - ).map((node, i, nodes) => + ).map((node) => node.type === "word" ? node.value : node.value === "" ? "" - : printWhitespace(path, node.value, options, { - // The nodes next to `WhitespaceNode` are guaranteed to be `WordNode` - // `prettier-ignore` is required due to https://github.com/prettier/prettier/issues/8171 - // prettier-ignore - previous: /** @type {import("./utils.js").WordNode} */ (nodes[i - 1]), - // prettier-ignore - next: /** @type {import("./utils.js").WordNode} */ (nodes[i + 1]), - }) + : printWhitespace( + path, + node.value, + options + // `Unset adjacentNodes` because the deep equality test will fail if set + ) ); } diff --git a/tests/format/markdown/splitCjkText/__snapshots__/jsfmt.spec.js.snap b/tests/format/markdown/splitCjkText/__snapshots__/jsfmt.spec.js.snap index 13869fe3acf0..4b68052c1481 100644 --- a/tests/format/markdown/splitCjkText/__snapshots__/jsfmt.spec.js.snap +++ b/tests/format/markdown/splitCjkText/__snapshots__/jsfmt.spec.js.snap @@ -330,14 +330,11 @@ English ィ、ゴー! [ウ -ィキペディアウ -ィキペディアウ -ィキペディアウ -ィキペディアウ -ィキペディアウ -ィキペディアウ ィキペディア] +[ウ +ィキペディア]: https://ja.wikipedia.org/ + =====================================output===================================== 日本語、にほんご。汉语, 中文. 日本語,にほんご.English words!? 漢字!汉字?「セ リフ」(括弧) 文字(括弧)文字【括弧】日本語English @@ -356,7 +353,9 @@ English 禁則処理にわざと違反した文章のテストを今から行います。準備はいいでしょうか?レ ディ、ゴー! -[ウィキペディアウィキペディアウィキペディアウィキペディアウィキペディアウィキペディアウィキペディア] +[ウ ィキペディア] + +[ウ ィキペディア]: https://ja.wikipedia.org/ ================================================================================ `; diff --git a/tests/format/markdown/splitCjkText/symbolSpaceNewLine.md b/tests/format/markdown/splitCjkText/symbolSpaceNewLine.md index 7facace6616e..f46aaee81100 100644 --- a/tests/format/markdown/splitCjkText/symbolSpaceNewLine.md +++ b/tests/format/markdown/splitCjkText/symbolSpaceNewLine.md @@ -53,10 +53,7 @@ English ィ、ゴー! [ウ -ィキペディアウ -ィキペディアウ -ィキペディアウ -ィキペディアウ -ィキペディアウ -ィキペディアウ ィキペディア] + +[ウ +ィキペディア]: https://ja.wikipedia.org/ From 526b1fe35d6f401a39316c3399b509fa6542cc2e Mon Sep 17 00:00:00 2001 From: Georgii Dolzhykov Date: Mon, 17 Oct 2022 00:13:54 +0300 Subject: [PATCH 72/87] Minor tweaks --- src/language-markdown/print-whitespace.js | 55 +++++++++++------------ 1 file changed, 25 insertions(+), 30 deletions(-) diff --git a/src/language-markdown/print-whitespace.js b/src/language-markdown/print-whitespace.js index 9d7f9a71bb8f..41069ca0b21c 100644 --- a/src/language-markdown/print-whitespace.js +++ b/src/language-markdown/print-whitespace.js @@ -44,12 +44,12 @@ const lineBreakBetweenTheseAndCJKConvertsToSpace = new Set( ); /** - * Finds out if Space is tend to be inserted between Chinese or Japanese characters + * Finds out if Space tends to be inserted between Chinese or Japanese characters * (including ideograph aka han or kanji e.g. `字`, hiragana e.g. `あ`, and katakana e.g. `ア`) * and other letters (including alphanumerics; e.g. `A` or `1`) in the sentence. * * @param {AstPath} path current position in nodes tree - * @returns {boolean} `true` if Space is tend to be inserted between these types of letters, `false` otherwise. + * @returns {boolean} `true` if Space tends to be inserted between these types of letters, `false` otherwise. */ function isInSentenceWithCJSpaces(path) { const sentenceNode = path.parent; @@ -57,10 +57,8 @@ function isInSentenceWithCJSpaces(path) { return sentenceNode.usesCJSpaces; } - const cjNonCJKSpacingStatistics = { - " ": 0, - "": 0, - }; + const stats = { " ": 0, "": 0 }; + for (let i = 1; i < sentenceNode.children.length - 1; ++i) { const node = sentenceNode.children[i]; if ( @@ -70,16 +68,16 @@ function isInSentenceWithCJSpaces(path) { const previousKind = sentenceNode.children[i - 1].kind; const nextKind = sentenceNode.children[i + 1].kind; if ( - (previousKind === "cj-letter" && nextKind === "non-cjk") || - (previousKind === "non-cjk" && nextKind === "cj-letter") + (previousKind === KIND_CJ_LETTER && nextKind === KIND_NON_CJK) || + (previousKind === KIND_NON_CJK && nextKind === KIND_CJ_LETTER) ) { - ++cjNonCJKSpacingStatistics[node.value]; + ++stats[node.value]; } } } + // Injects a property to cache the result. - sentenceNode.usesCJSpaces = - cjNonCJKSpacingStatistics[" "] > cjNonCJKSpacingStatistics[""]; + sentenceNode.usesCJSpaces = stats[" "] > stats[""]; return sentenceNode.usesCJSpaces; } @@ -98,7 +96,7 @@ function isInSentenceWithCJSpaces(path) { * For example, if you would like to squash the multi-line string `"You might want\nto use Prettier."` into a single line, * you would replace `"\n"` with `" "`. (`"You might want to use Prettier."`) * - * However, you should note that Chinese and Japanese does not use U+0020 Space to divide words, so U+000A End of Line must not be replaced with it. + * However, you should note that Chinese and Japanese do not use U+0020 Space to divide words, so U+000A End of Line must not be replaced with it. * Behavior in other languages (e.g. Thai) will not be changed because there are too much things to consider. (PR welcome) * * @param {AstPath} path path of given node @@ -119,7 +117,7 @@ function canBeConvertedToSpace(path, adjacentNodes) { const previousKind = adjacentNodes.previous.kind; const nextKind = adjacentNodes.next.kind; - // "\n" between not western or Korean (han, kana, CJK punctuations) characters always can converted to Space + // "\n" between not western or Korean (han, kana, CJK punctuations) characters always can be converted to Space // Korean hangul simulates latin words; see #6516 (https://github.com/prettier/prettier/issues/6516) if ( isWesternOrKoreanLetter(previousKind) && @@ -177,7 +175,7 @@ function canBeConvertedToSpace(path, adjacentNodes) { return false; } - // If the sentence uses the style that Space is injected in between CJ and alphanumerics, "\n" can be converted to Space. + // If the sentence uses the style with spaces between CJ and alphanumerics, "\n" can be converted to Space. return isInSentenceWithCJSpaces(path); } @@ -212,13 +210,14 @@ function isWesternOrKoreanLetter(kind) { } /** - * Returns whether “whitespace” (`"" | " " | "\n"`; see `WhitespaceValue`) can converted to `"\n"` + * Returns whether “whitespace” (`"" | " " | "\n"`; see `WhitespaceValue`) can be converted to `"\n"` * * @param {AstPath} path * @param {WhitespaceValue} value * @param {*} options * @param {AdjacentNodes | undefined} [adjacentNodes] - * @returns {boolean | "trueIfSpace"} `true` if “whitespace” can be converted to `"\n"`; `trueIfSpace` equals to true only if `canBeConvertedToSpace` returns `true` + * @returns {boolean | "trueIfSpace"} `true` if “whitespace” can be converted to `"\n"`; + * `trueIfSpace` means it can be converted only if `canBeConvertedToSpace` returns `true` */ function isBreakable(path, value, options, adjacentNodes) { if ( @@ -251,10 +250,10 @@ function isBreakable(path, value, options, adjacentNodes) { // https://en.wikipedia.org/wiki/Line_breaking_rules_in_East_Asian_languages const violatesCJKLineBreakingRule = - (adjacentNodes.next?.value !== undefined && - noBreakBefore.has(adjacentNodes.next?.value?.[0])) || - (adjacentNodes.previous?.value !== undefined && - noBreakAfter.has(adjacentNodes.previous?.value?.at(-1))); + (adjacentNodes.next !== undefined && + noBreakBefore.has(adjacentNodes.next.value[0])) || + (adjacentNodes.previous !== undefined && + noBreakAfter.has(adjacentNodes.previous.value.at(-1))); // Targets not only "" but also "\n" // Intentional violation of the line breaking rules (e.g. “ル\nール守れ\n!”) tends to be “corrected” ("\n" -> "") by formatted with a large value of `printWidth`. @@ -281,18 +280,14 @@ function printWhitespace(path, value, options, adjacentNodes) { const isBreakable_ = isBreakable(path, value, options, adjacentNodes); - // Space or empty - if (value !== "\n") { - return convertToLineIfBreakable(value, isBreakable_); + if (value === "\n") { + // Chinese and Japanese do not use U+0020 Space to divide words, so U+000A End of Line must not be replaced with it. + // Behavior in other languages will not be changed because there are too much things to consider. (PR welcome) + // e.g. Word segmentation in Thai etc. + value = canBeConvertedToSpace(path, adjacentNodes) ? " " : ""; } - // Chinese and Japanese does not use U+0020 Space to divide words, so U+000A End of Line must not be replaced with it. - // Behavior in other languages will not be changed because there are too much things to consider. (PR welcome) - // e.g. Word segmentation in Thai etc. - return convertToLineIfBreakable( - canBeConvertedToSpace(path, adjacentNodes) ? " " : "", - isBreakable_ - ); + return convertToLineIfBreakable(value, isBreakable_); } /** From 20e5393f2094078d6c6144bd30614726949014c0 Mon Sep 17 00:00:00 2001 From: Georgii Dolzhykov Date: Mon, 17 Oct 2022 00:25:00 +0300 Subject: [PATCH 73/87] Remove convertToLineIfBreakable --- src/language-markdown/print-whitespace.js | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/src/language-markdown/print-whitespace.js b/src/language-markdown/print-whitespace.js index 41069ca0b21c..4ef2066b0ec7 100644 --- a/src/language-markdown/print-whitespace.js +++ b/src/language-markdown/print-whitespace.js @@ -287,24 +287,19 @@ function printWhitespace(path, value, options, adjacentNodes) { value = canBeConvertedToSpace(path, adjacentNodes) ? " " : ""; } - return convertToLineIfBreakable(value, isBreakable_); -} - -/** - * @param {" " | ""} value - * @param {ReturnType} isBreakable_ - */ -function convertToLineIfBreakable(value, isBreakable_) { if (!isBreakable_) { return value; } + if (value === " ") { return line; } + // value === "" if (isBreakable_ === "trueIfSpace") { return ""; } + // isBreakable === true return softline; } From b939f93cabe4f91c1831b83fa34517f3447636d9 Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Tue, 18 Oct 2022 00:21:47 +0900 Subject: [PATCH 74/87] Fix comments --- src/language-markdown/print-whitespace.js | 11 +++++++---- src/language-markdown/printer-markdown.js | 2 +- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/language-markdown/print-whitespace.js b/src/language-markdown/print-whitespace.js index 4ef2066b0ec7..6aa825b17d77 100644 --- a/src/language-markdown/print-whitespace.js +++ b/src/language-markdown/print-whitespace.js @@ -140,7 +140,6 @@ function canBeConvertedToSpace(path, adjacentNodes) { previousKind === KIND_CJK_PUNCTUATION || nextKind === KIND_CJK_PUNCTUATION || // "\n" between CJ always SHALL NOT be convertedToSpace - // "\n" between Korean and CJ is better not to be convertedToSpace (isCJK(previousKind) && isCJK(nextKind)) ) { return false; @@ -255,12 +254,16 @@ function isBreakable(path, value, options, adjacentNodes) { (adjacentNodes.previous !== undefined && noBreakAfter.has(adjacentNodes.previous.value.at(-1))); - // Targets not only "" but also "\n" - // Intentional violation of the line breaking rules (e.g. “ル\nール守れ\n!”) tends to be “corrected” ("\n" -> "") by formatted with a large value of `printWidth`. + // When violates the CJK line breaking rules if breaks lines there ("") or leaves surrounding lines divided ("\n")... + // - "" => always `false` (i.e. disallows to break lines there) + // - "\n" => `true` (i.e. don't force surrounding lines to be joined) only when it should be converted to Space + // `false` (i.e. join surrounding lines into a single line) otherwise + // The mandatory joining behavior when Space is not allowed is necessary because intentional violation of the line breaking rules (e.g. “ル\nール守れ\n!”) tends to be “corrected” ("\n" -> "") by formatted with a large value of `printWidth`. // Eventually, by reformatted with a smaller value of `printWidth` or because of a paragraph revision, the rules are going to be applied to the place that used to violate them. // e.g. “シ\nョ\nートカ\nット\n!\n?” (completely violates the rules on purpose) --[printWidth = 6]-->“ショー\nトカッ\nト!?” --[s/^/こんなところで/]--> “こんな\nところ\nで\nショー\nトカッ\nト!?” (completely complies to the rules) + // On the contrary, if `false` even should be Space, the following loop will occur: + // the surrounding lines are joined with `" "` -> divided into 2 lines by `" "` -> joined again -> ... if (violatesCJKLineBreakingRule) { - // If `false` even Space, joined with `" "` -> divided into 2 lines by `" "` loop occurs return value === "\n" ? "trueIfSpace" : false; } diff --git a/src/language-markdown/printer-markdown.js b/src/language-markdown/printer-markdown.js index c512aba1a85d..bd9e5d63ad11 100644 --- a/src/language-markdown/printer-markdown.js +++ b/src/language-markdown/printer-markdown.js @@ -71,7 +71,7 @@ function genericPrint(path, options, print) { path, node.value, options - // `Unset adjacentNodes` because the deep equality test will fail if set + // Leave `adjacentNodes` unset because the deep equality test will fail if set ) ); } From b0e57ae085c22aa94822c9104362942e18f9a070 Mon Sep 17 00:00:00 2001 From: Georgii Dolzhykov Date: Mon, 17 Oct 2022 18:45:26 +0300 Subject: [PATCH 75/87] Refactoring: use assert, optional chaining --- src/language-markdown/utils.js | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/src/language-markdown/utils.js b/src/language-markdown/utils.js index 32643f3a6e73..0d4a24a66cb6 100644 --- a/src/language-markdown/utils.js +++ b/src/language-markdown/utils.js @@ -1,3 +1,4 @@ +import assert from "node:assert"; import { locStart, locEnd } from "./loc.js"; import { cjkPattern, @@ -135,14 +136,10 @@ function splitText(text) { // Check for `canBeConvertedToSpace` in ./whitespace.js etc. if (process.env.NODE_ENV !== "production") { for (let i = 1; i < nodes.length; i++) { - if ( - nodes[i].type === "whitespace" && - nodes[i - 1].type === "whitespace" - ) { - throw new Error( - "Internal error: splitText should not create consecutive whitespace nodes" - ); - } + assert( + !(nodes[i - 1].type === "whitespace" && nodes[i].type === "whitespace"), + "splitText should not create consecutive whitespace nodes" + ); } } @@ -151,8 +148,7 @@ function splitText(text) { function appendNode(node) { const lastNode = nodes.at(-1); if ( - lastNode && - lastNode.type === "word" && + lastNode?.type === "word" && !isBetween(KIND_NON_CJK, KIND_CJK_PUNCTUATION) && // disallow leading/trailing full-width whitespace ![lastNode.value, node.value].some((value) => /\u3000/.test(value)) From a9cf055ca7b9fa64c4751a21632ed334bda43c1b Mon Sep 17 00:00:00 2001 From: Georgii Dolzhykov Date: Mon, 17 Oct 2022 19:01:26 +0300 Subject: [PATCH 76/87] Refactor: remove isCJK --- src/language-markdown/print-whitespace.js | 14 +------------- src/language-markdown/utils.js | 2 +- 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/src/language-markdown/print-whitespace.js b/src/language-markdown/print-whitespace.js index 6aa825b17d77..20b7f4862ca6 100644 --- a/src/language-markdown/print-whitespace.js +++ b/src/language-markdown/print-whitespace.js @@ -140,7 +140,7 @@ function canBeConvertedToSpace(path, adjacentNodes) { previousKind === KIND_CJK_PUNCTUATION || nextKind === KIND_CJK_PUNCTUATION || // "\n" between CJ always SHALL NOT be convertedToSpace - (isCJK(previousKind) && isCJK(nextKind)) + (previousKind === KIND_CJ_LETTER && nextKind === KIND_CJ_LETTER) ) { return false; } @@ -178,18 +178,6 @@ function canBeConvertedToSpace(path, adjacentNodes) { return isInSentenceWithCJSpaces(path); } -/** - * @param {WordKind | undefined} kind - * @returns {boolean} `true` if `kind` is CJK (including punctuation marks) - */ -function isCJK(kind) { - return ( - kind === KIND_CJ_LETTER || - kind === KIND_K_LETTER || - kind === KIND_CJK_PUNCTUATION - ); -} - /** * @param {WordKind | undefined} kind * @returns {boolean} `true` if `kind` is letter (not CJK punctuation) diff --git a/src/language-markdown/utils.js b/src/language-markdown/utils.js index 0d4a24a66cb6..9ae051340273 100644 --- a/src/language-markdown/utils.js +++ b/src/language-markdown/utils.js @@ -133,7 +133,7 @@ function splitText(text) { } } - // Check for `canBeConvertedToSpace` in ./whitespace.js etc. + // Check for `canBeConvertedToSpace` in ./print-whitespace.js etc. if (process.env.NODE_ENV !== "production") { for (let i = 1; i < nodes.length; i++) { assert( From a366c9ea1e877f81de24da96ffc041d4605b99fd Mon Sep 17 00:00:00 2001 From: Georgii Dolzhykov Date: Mon, 17 Oct 2022 19:53:46 +0300 Subject: [PATCH 77/87] Add explanatory comment about link labels --- src/language-markdown/printer-markdown.js | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/language-markdown/printer-markdown.js b/src/language-markdown/printer-markdown.js index bd9e5d63ad11..f355e6a16101 100644 --- a/src/language-markdown/printer-markdown.js +++ b/src/language-markdown/printer-markdown.js @@ -71,7 +71,9 @@ function genericPrint(path, options, print) { path, node.value, options - // Leave `adjacentNodes` unset because the deep equality test will fail if set + // Without the `adjacentNodes` argument `printWhitespace` wraps/unwraps + // text in such a way that the normalized form of a link label stays the same. + // See https://spec.commonmark.org/0.30/#matches ) ); } From 9331a2b997f9aeae026d8cf68bdc59700354e296 Mon Sep 17 00:00:00 2001 From: Georgii Dolzhykov Date: Mon, 17 Oct 2022 20:50:28 +0300 Subject: [PATCH 78/87] Refactor printWhitespace --- src/language-markdown/print-whitespace.js | 37 +++++++---------------- 1 file changed, 11 insertions(+), 26 deletions(-) diff --git a/src/language-markdown/print-whitespace.js b/src/language-markdown/print-whitespace.js index 20b7f4862ca6..f38aeda96df8 100644 --- a/src/language-markdown/print-whitespace.js +++ b/src/language-markdown/print-whitespace.js @@ -202,11 +202,11 @@ function isWesternOrKoreanLetter(kind) { * @param {AstPath} path * @param {WhitespaceValue} value * @param {*} options - * @param {AdjacentNodes | undefined} [adjacentNodes] - * @returns {boolean | "trueIfSpace"} `true` if “whitespace” can be converted to `"\n"`; - * `trueIfSpace` means it can be converted only if `canBeConvertedToSpace` returns `true` + * @param {AdjacentNodes | undefined} adjacentNodes + * @param {boolean} canBeSpace + * @returns {boolean} `true` if “whitespace” can be converted to `"\n"` */ -function isBreakable(path, value, options, adjacentNodes) { +function isBreakable(path, value, options, adjacentNodes, canBeSpace) { if ( options.proseWrap !== "always" || getAncestorNode(path, SINGLE_LINE_NODE_TYPES) @@ -252,7 +252,7 @@ function isBreakable(path, value, options, adjacentNodes) { // On the contrary, if `false` even should be Space, the following loop will occur: // the surrounding lines are joined with `" "` -> divided into 2 lines by `" "` -> joined again -> ... if (violatesCJKLineBreakingRule) { - return value === "\n" ? "trueIfSpace" : false; + return value === "\n" ? canBeSpace : false; } return true; @@ -269,30 +269,15 @@ function printWhitespace(path, value, options, adjacentNodes) { return hardline; } - const isBreakable_ = isBreakable(path, value, options, adjacentNodes); - - if (value === "\n") { - // Chinese and Japanese do not use U+0020 Space to divide words, so U+000A End of Line must not be replaced with it. - // Behavior in other languages will not be changed because there are too much things to consider. (PR welcome) - // e.g. Word segmentation in Thai etc. - value = canBeConvertedToSpace(path, adjacentNodes) ? " " : ""; - } - - if (!isBreakable_) { - return value; - } - - if (value === " ") { - return line; - } + const canBeSpace = + value === " " || + (value === "\n" && canBeConvertedToSpace(path, adjacentNodes)); - // value === "" - if (isBreakable_ === "trueIfSpace") { - return ""; + if (isBreakable(path, value, options, adjacentNodes, canBeSpace)) { + return canBeSpace ? line : softline; } - // isBreakable === true - return softline; + return canBeSpace ? " " : ""; } export { printWhitespace }; From 5962b464570a362a3efd968293ba122783bf4647 Mon Sep 17 00:00:00 2001 From: Georgii Dolzhykov Date: Mon, 17 Oct 2022 20:57:07 +0300 Subject: [PATCH 79/87] Minor refactoring --- src/language-markdown/print-whitespace.js | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/language-markdown/print-whitespace.js b/src/language-markdown/print-whitespace.js index f38aeda96df8..216ed96a8899 100644 --- a/src/language-markdown/print-whitespace.js +++ b/src/language-markdown/print-whitespace.js @@ -84,7 +84,7 @@ function isInSentenceWithCJSpaces(path) { /** * @typedef {import("./utils.js").WordNode} WordNode * @typedef {import("./utils.js").WhitespaceValue} WhitespaceValue - * @typedef {{next?: WordNode | undefined | null, previous?: WordNode | undefined | null}} AdjacentNodes + * @typedef {{ next?: WordNode | null, previous?: WordNode | null }} AdjacentNodes * Adjacent node to `WhitespaceNode`. the consecution of `WhitespaceNode` is a bug, so adjacent nodes must be `WordNode`. * @typedef {import("./utils.js").WordKind} WordKind * @typedef {import("../common/ast-path.js").default} AstPath @@ -104,13 +104,13 @@ function isInSentenceWithCJSpaces(path) { * @returns {boolean} `true` if given node can be convertedToSpace, `false` if not (i.e. newline or empty character) */ function canBeConvertedToSpace(path, adjacentNodes) { - // no adjacent nodes - if (!adjacentNodes) { - return true; - } - - // e.g. " \nletter" - if (!adjacentNodes.previous || !adjacentNodes.next) { + if ( + // no adjacent nodes - this happens for linkReference and imageReference nodes + !adjacentNodes || + // e.g. " \nletter" + !adjacentNodes.previous || + !adjacentNodes.next + ) { return true; } @@ -252,7 +252,7 @@ function isBreakable(path, value, options, adjacentNodes, canBeSpace) { // On the contrary, if `false` even should be Space, the following loop will occur: // the surrounding lines are joined with `" "` -> divided into 2 lines by `" "` -> joined again -> ... if (violatesCJKLineBreakingRule) { - return value === "\n" ? canBeSpace : false; + return value === "\n" && canBeSpace; } return true; From 82b41535d5c82222f8958159c7d6207c6961a6a4 Mon Sep 17 00:00:00 2001 From: Georgii Dolzhykov Date: Mon, 17 Oct 2022 21:13:53 +0300 Subject: [PATCH 80/87] Another refactoring --- src/language-markdown/print-whitespace.js | 25 +++++++++++------------ 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/src/language-markdown/print-whitespace.js b/src/language-markdown/print-whitespace.js index 216ed96a8899..0a0645092083 100644 --- a/src/language-markdown/print-whitespace.js +++ b/src/language-markdown/print-whitespace.js @@ -103,7 +103,7 @@ function isInSentenceWithCJSpaces(path) { * @param {AdjacentNodes | undefined} adjacentNodes adjacent sibling nodes of given node * @returns {boolean} `true` if given node can be convertedToSpace, `false` if not (i.e. newline or empty character) */ -function canBeConvertedToSpace(path, adjacentNodes) { +function lineBreakCanBeConvertedToSpace(path, adjacentNodes) { if ( // no adjacent nodes - this happens for linkReference and imageReference nodes !adjacentNodes || @@ -197,10 +197,10 @@ function isWesternOrKoreanLetter(kind) { } /** - * Returns whether “whitespace” (`"" | " " | "\n"`; see `WhitespaceValue`) can be converted to `"\n"` + * Checks whether whitespace can be printed as a line break * * @param {AstPath} path - * @param {WhitespaceValue} value + * @param {WhitespaceValue} value (`"" | " " | "\n"`) * @param {*} options * @param {AdjacentNodes | undefined} adjacentNodes * @param {boolean} canBeSpace @@ -222,25 +222,24 @@ function isBreakable(path, value, options, adjacentNodes, canBeSpace) { return true; } + const { previous, next } = adjacentNodes; + // Simulates latin words; see #6516 (https://github.com/prettier/prettier/issues/6516) // [Latin][""][Hangul] & vice versa => Don't break // [han & kana][""][hangul], either if ( value === "" && - ((adjacentNodes.previous?.kind === KIND_K_LETTER && - isLetter(adjacentNodes.next?.kind)) || - (adjacentNodes.next?.kind === KIND_K_LETTER && - isLetter(adjacentNodes.previous?.kind))) + ((previous?.kind === KIND_K_LETTER && isLetter(next?.kind)) || + (next?.kind === KIND_K_LETTER && isLetter(previous?.kind))) ) { return false; } // https://en.wikipedia.org/wiki/Line_breaking_rules_in_East_Asian_languages const violatesCJKLineBreakingRule = - (adjacentNodes.next !== undefined && - noBreakBefore.has(adjacentNodes.next.value[0])) || - (adjacentNodes.previous !== undefined && - noBreakAfter.has(adjacentNodes.previous.value.at(-1))); + !canBeSpace && + ((next && noBreakBefore.has(next.value[0])) || + (previous && noBreakAfter.has(previous.value.at(-1)))); // When violates the CJK line breaking rules if breaks lines there ("") or leaves surrounding lines divided ("\n")... // - "" => always `false` (i.e. disallows to break lines there) @@ -252,7 +251,7 @@ function isBreakable(path, value, options, adjacentNodes, canBeSpace) { // On the contrary, if `false` even should be Space, the following loop will occur: // the surrounding lines are joined with `" "` -> divided into 2 lines by `" "` -> joined again -> ... if (violatesCJKLineBreakingRule) { - return value === "\n" && canBeSpace; + return false; } return true; @@ -271,7 +270,7 @@ function printWhitespace(path, value, options, adjacentNodes) { const canBeSpace = value === " " || - (value === "\n" && canBeConvertedToSpace(path, adjacentNodes)); + (value === "\n" && lineBreakCanBeConvertedToSpace(path, adjacentNodes)); if (isBreakable(path, value, options, adjacentNodes, canBeSpace)) { return canBeSpace ? line : softline; From cb0501920bf35d735420cd5ee28b3da320bdd669 Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Wed, 19 Oct 2022 00:05:57 +0900 Subject: [PATCH 81/87] Improve comments --- src/language-markdown/print-whitespace.js | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/language-markdown/print-whitespace.js b/src/language-markdown/print-whitespace.js index 0a0645092083..c6910bd2514c 100644 --- a/src/language-markdown/print-whitespace.js +++ b/src/language-markdown/print-whitespace.js @@ -245,11 +245,10 @@ function isBreakable(path, value, options, adjacentNodes, canBeSpace) { // - "" => always `false` (i.e. disallows to break lines there) // - "\n" => `true` (i.e. don't force surrounding lines to be joined) only when it should be converted to Space // `false` (i.e. join surrounding lines into a single line) otherwise - // The mandatory joining behavior when Space is not allowed is necessary because intentional violation of the line breaking rules (e.g. “ル\nール守れ\n!”) tends to be “corrected” ("\n" -> "") by formatted with a large value of `printWidth`. - // Eventually, by reformatted with a smaller value of `printWidth` or because of a paragraph revision, the rules are going to be applied to the place that used to violate them. + // The mandatory joining behavior ("\n" -> "") when Space is not allowed at "\n" is necessary because intentional violation of the line breaking rules (e.g. “ル\nール守れ\n!”) tends to be “corrected” ("\n" -> "") after re-editing the document or changing the value of `printWidth`. // e.g. “シ\nョ\nートカ\nット\n!\n?” (completely violates the rules on purpose) --[printWidth = 6]-->“ショー\nトカッ\nト!?” --[s/^/こんなところで/]--> “こんな\nところ\nで\nショー\nトカッ\nト!?” (completely complies to the rules) - // On the contrary, if `false` even should be Space, the following loop will occur: - // the surrounding lines are joined with `" "` -> divided into 2 lines by `" "` -> joined again -> ... + // On the contrary, if `false` even should be Space (joins lines with an unbreakable " "), the following loop will occur: + // the 2 lines surrounding `"\n"` are joined with `" "` -> divided into 2 lines by `" "` again -> joined again -> ... if (violatesCJKLineBreakingRule) { return false; } From 7810a4fb7b55c72ea743ff43b4040543d605f1dd Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Wed, 19 Oct 2022 00:11:39 +0900 Subject: [PATCH 82/87] Fix comment saying opposite --- src/language-markdown/print-whitespace.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/language-markdown/print-whitespace.js b/src/language-markdown/print-whitespace.js index c6910bd2514c..9722e4957839 100644 --- a/src/language-markdown/print-whitespace.js +++ b/src/language-markdown/print-whitespace.js @@ -117,7 +117,7 @@ function lineBreakCanBeConvertedToSpace(path, adjacentNodes) { const previousKind = adjacentNodes.previous.kind; const nextKind = adjacentNodes.next.kind; - // "\n" between not western or Korean (han, kana, CJK punctuations) characters always can be converted to Space + // "\n" between western or Korean characters always can be converted to Space // Korean hangul simulates latin words; see #6516 (https://github.com/prettier/prettier/issues/6516) if ( isWesternOrKoreanLetter(previousKind) && From 0a27cbdf1bf69fc9770b7fb1acf489b55a21e64a Mon Sep 17 00:00:00 2001 From: Georgii Dolzhykov Date: Tue, 18 Oct 2022 18:34:13 +0300 Subject: [PATCH 83/87] Refactor, edit comments --- src/language-markdown/print-whitespace.js | 208 +++++++++++----------- src/language-markdown/printer-markdown.js | 11 +- 2 files changed, 110 insertions(+), 109 deletions(-) diff --git a/src/language-markdown/print-whitespace.js b/src/language-markdown/print-whitespace.js index 9722e4957839..b772d6b45565 100644 --- a/src/language-markdown/print-whitespace.js +++ b/src/language-markdown/print-whitespace.js @@ -9,103 +9,112 @@ import { const SINGLE_LINE_NODE_TYPES = ["heading", "tableCell", "link", "wikiLink"]; -// https://en.wikipedia.org/wiki/Line_breaking_rules_in_East_Asian_languages /** - * The set of characters that must not immediately precede a line break + * These characters must not immediately precede a line break. * - * e.g. `"("` + * e.g. `"("`: * * - Bad: `"檜原村(\nひのはらむら)"` - * - Good: `"檜原村\n(ひのはらむら)"` or ``"檜原村(ひ\nのはらむら)"` + * - Good: `"檜原村\n(ひのはらむら)"` or + * `"檜原村(ひ\nのはらむら)"` */ const noBreakAfter = new Set( "$(£¥·'\"〈《「『【〔〖〝﹙﹛$([{£¥[{‵︴︵︷︹︻︽︿﹁﹃﹏〘⦅«" ); /** - * The set of characters that must not immediately follow a line break + * These characters must not immediately follow a line break. * - * e.g. `")"` + * e.g. `")"`: * * - Bad: `"檜原村(ひのはらむら\n)以外には、"` - * - Good: `"檜原村(ひのはらむ\nら)以外には、"` or `"檜原村(ひのはらむら)\n以外には、"` + * - Good: `"檜原村(ひのはらむ\nら)以外には、"` or + * `"檜原村(ひのはらむら)\n以外には、"` */ const noBreakBefore = new Set( "!%),.:;?]}¢°·'\"†‡›℃∶、。〃〆〕〗〞﹚﹜!"%'),.:;?]}~–—•〉》」︰︱︲︳﹐﹑﹒﹓﹔﹕﹖﹘︶︸︺︼︾﹀﹂﹗|、』】〙〟⦆»ヽヾーァィゥェォッャュョヮヵヶぁぃぅぇぉっゃゅょゎゕゖㇰㇱㇲㇳㇴㇵㇶㇷㇸㇹㇺㇻㇼㇽㇾㇿ々〻‐゠〜~‼⁇⁈⁉・" ); /** - * The set of characters whose surrounding newline may be converted to Space - * - * - ASCII punctuation marks + * A line break between a character from this set and CJ can be converted to a + * space. Includes only ASCII punctuation marks for now. */ -const lineBreakBetweenTheseAndCJKConvertsToSpace = new Set( +const lineBreakBetweenTheseAndCJConvertsToSpace = new Set( "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" ); /** - * Finds out if Space tends to be inserted between Chinese or Japanese characters - * (including ideograph aka han or kanji e.g. `字`, hiragana e.g. `あ`, and katakana e.g. `ア`) - * and other letters (including alphanumerics; e.g. `A` or `1`) in the sentence. + * Determine the preferred style of spacing between Chinese or Japanese and non-CJK + * characters in the parent `sentence` node. * - * @param {AstPath} path current position in nodes tree - * @returns {boolean} `true` if Space tends to be inserted between these types of letters, `false` otherwise. + * @param {AstPath} path + * @returns {boolean} `true` if Space tends to be inserted between CJ and + * non-CJK, `false` otherwise. */ -function isInSentenceWithCJSpaces(path) { - const sentenceNode = path.parent; - if (sentenceNode.usesCJSpaces !== undefined) { - return sentenceNode.usesCJSpaces; - } - - const stats = { " ": 0, "": 0 }; +function isInSentenceWithCJSpaces({ parent: sentenceNode }) { + if (sentenceNode.usesCJSpaces === undefined) { + const stats = { " ": 0, "": 0 }; + const { children } = sentenceNode; - for (let i = 1; i < sentenceNode.children.length - 1; ++i) { - const node = sentenceNode.children[i]; - if ( - node.type === "whitespace" && - (node.value === " " || node.value === "") - ) { - const previousKind = sentenceNode.children[i - 1].kind; - const nextKind = sentenceNode.children[i + 1].kind; + for (let i = 1; i < children.length - 1; ++i) { + const node = children[i]; if ( - (previousKind === KIND_CJ_LETTER && nextKind === KIND_NON_CJK) || - (previousKind === KIND_NON_CJK && nextKind === KIND_CJ_LETTER) + node.type === "whitespace" && + (node.value === " " || node.value === "") ) { - ++stats[node.value]; + const previousKind = children[i - 1].kind; + const nextKind = children[i + 1].kind; + if ( + (previousKind === KIND_CJ_LETTER && nextKind === KIND_NON_CJK) || + (previousKind === KIND_NON_CJK && nextKind === KIND_CJ_LETTER) + ) { + ++stats[node.value]; + } } } + + // Inject a property to cache the result. + sentenceNode.usesCJSpaces = stats[" "] > stats[""]; } - // Injects a property to cache the result. - sentenceNode.usesCJSpaces = stats[" "] > stats[""]; return sentenceNode.usesCJSpaces; } /** * @typedef {import("./utils.js").WordNode} WordNode * @typedef {import("./utils.js").WhitespaceValue} WhitespaceValue - * @typedef {{ next?: WordNode | null, previous?: WordNode | null }} AdjacentNodes - * Adjacent node to `WhitespaceNode`. the consecution of `WhitespaceNode` is a bug, so adjacent nodes must be `WordNode`. + * @typedef {{ next?: WordNode | null, previous?: WordNode | null }} + * AdjacentNodes Nodes adjacent to a `whitespace` node. Are always of type + * `word`. * @typedef {import("./utils.js").WordKind} WordKind * @typedef {import("../common/ast-path.js").default} AstPath + * @typedef {"always" | "never" | "preserve"} ProseWrap */ /** - * Checks if given `"\n"` node can be converted to Space + * Check whether the given `"\n"` node can be converted to a space. + * + * For example, if you would like to squash English text + * + * "You might want\nto use Prettier." * - * For example, if you would like to squash the multi-line string `"You might want\nto use Prettier."` into a single line, - * you would replace `"\n"` with `" "`. (`"You might want to use Prettier."`) + * into a single line, you would replace `"\n"` with `" "`: * - * However, you should note that Chinese and Japanese do not use U+0020 Space to divide words, so U+000A End of Line must not be replaced with it. - * Behavior in other languages (e.g. Thai) will not be changed because there are too much things to consider. (PR welcome) + * "You might want to use Prettier." + * + * However, Chinese and Japanese don't use U+0020 Space to divide words, so line + * breaks shouldn't be replaced with spaces for those languages. + * + * PRs are welcome to support line breaking rules for other languages. * * @param {AstPath} path path of given node - * @param {AdjacentNodes | undefined} adjacentNodes adjacent sibling nodes of given node - * @returns {boolean} `true` if given node can be convertedToSpace, `false` if not (i.e. newline or empty character) + * @param {AdjacentNodes | undefined} adjacentNodes adjacent sibling nodes of + * the given node + * @returns {boolean} */ function lineBreakCanBeConvertedToSpace(path, adjacentNodes) { if ( - // no adjacent nodes - this happens for linkReference and imageReference nodes + // no adjacent nodes - happens for linkReference and imageReference nodes !adjacentNodes || // e.g. " \nletter" !adjacentNodes.previous || @@ -117,56 +126,62 @@ function lineBreakCanBeConvertedToSpace(path, adjacentNodes) { const previousKind = adjacentNodes.previous.kind; const nextKind = adjacentNodes.next.kind; - // "\n" between western or Korean characters always can be converted to Space - // Korean hangul simulates latin words; see #6516 (https://github.com/prettier/prettier/issues/6516) - if ( - isWesternOrKoreanLetter(previousKind) && - isWesternOrKoreanLetter(nextKind) - ) { - return true; - } - - // han & hangul: same way preferred if ( + // "\n" between non-CJK or Korean characters always can be converted to a + // space. Korean Hangul simulates Latin words. See + // https://github.com/prettier/prettier/issues/6516 + (isNonCJKOrKoreanLetter(previousKind) && + isNonCJKOrKoreanLetter(nextKind)) || + // Han & Hangul: same way preferred (previousKind === KIND_K_LETTER && nextKind === KIND_CJ_LETTER) || (nextKind === KIND_K_LETTER && previousKind === KIND_CJ_LETTER) ) { return true; } - // Do not convert it to Space when: + // Do not convert \n to a space: if ( - // Shall not be converted to Space around CJK punctuation + // around CJK punctuation previousKind === KIND_CJK_PUNCTUATION || nextKind === KIND_CJK_PUNCTUATION || - // "\n" between CJ always SHALL NOT be convertedToSpace + // between CJ (previousKind === KIND_CJ_LETTER && nextKind === KIND_CJ_LETTER) ) { return false; } + // The rest of this function deals only with line breaks between CJ and + // non-CJK characters. + const characterBefore = adjacentNodes.previous.value.at(-1); const characterAfter = adjacentNodes.next.value[0]; - // From here down, only line breaks between CJ and non-CJK characters are covered. - - // Convert newline between CJ and specific symbol characters (e.g. ASCII punctuation) to Space. - // e.g. :::\n句子句子句子\n::: → ::: 句子句子句子 ::: + // Convert a line break between CJ and certain non-letter characters (e.g. + // ASCII punctuation) to a space. // - // Note: Line breaks like "(\n句子句子\n)" or "句子\n." by Prettier are suppressed in `isBreakable(...)`. + // E.g. :::\n句子句子句子\n::: → ::: 句子句子句子 ::: + // + // Note: line breaks like "(\n句子句子\n)" or "句子\n." are suppressed in + // `isBreakable(...)`. if ( - lineBreakBetweenTheseAndCJKConvertsToSpace.has(characterAfter) || - lineBreakBetweenTheseAndCJKConvertsToSpace.has(characterBefore) + lineBreakBetweenTheseAndCJConvertsToSpace.has(characterAfter) || + lineBreakBetweenTheseAndCJConvertsToSpace.has(characterBefore) ) { return true; } - // Converting newline between CJ and non-ASCII punctuation to Space does not seem to be better in many cases. (PR welcome) - // e.g. - // 1. “ア〜エの中から1つ選べ。” - // "〜" (U+301C) belongs to Pd, and "\n" in "ア〜\nエの中から1つ選べ。" must not be converted to Space. - // 2. “これはひどい……なんと汚いコミットログなんだ……” - // "…" (U+2026) belongs to Po, and "\n" in "これはひどい……\nなんと汚いコミットログなんだ……" must not, either. + // Converting a line break between CJ and non-ASCII punctuation to a space is + // undesired in many cases. PRs are welcome to fine-tune this logic. + // + // Examples where \n must not be converted to a space: + // + // 1. "〜" (U+301C, belongs to Pd) in + // + // "ア〜\nエの中から1つ選べ。" + // + // 2. "…" (U+2026, belongs to Po) in + // + // "これはひどい……\nなんと汚いコミットログなんだ……" if ( adjacentNodes.previous.hasTrailingPunctuation || adjacentNodes.next.hasLeadingPunctuation @@ -174,13 +189,14 @@ function lineBreakCanBeConvertedToSpace(path, adjacentNodes) { return false; } - // If the sentence uses the style with spaces between CJ and alphanumerics, "\n" can be converted to Space. + // If the sentence uses the style with spaces between CJ and non-CJK, "\n" can + // be converted to a space. return isInSentenceWithCJSpaces(path); } /** * @param {WordKind | undefined} kind - * @returns {boolean} `true` if `kind` is letter (not CJK punctuation) + * @returns {boolean} `true` if `kind` is defined and not CJK punctuation */ function isLetter(kind) { return ( @@ -190,33 +206,31 @@ function isLetter(kind) { /** * @param {WordKind | undefined} kind - * @returns {boolean} `true` if `kind` is western or Korean letters (divides words by Space) + * @returns {boolean} `true` if `kind` is Korean letter or non-CJK */ -function isWesternOrKoreanLetter(kind) { +function isNonCJKOrKoreanLetter(kind) { return kind === KIND_NON_CJK || kind === KIND_K_LETTER; } /** - * Checks whether whitespace can be printed as a line break + * Check whether whitespace can be printed as a line break. * * @param {AstPath} path - * @param {WhitespaceValue} value (`"" | " " | "\n"`) - * @param {*} options + * @param {WhitespaceValue} value + * @param {ProseWrap} proseWrap * @param {AdjacentNodes | undefined} adjacentNodes * @param {boolean} canBeSpace - * @returns {boolean} `true` if “whitespace” can be converted to `"\n"` + * @returns {boolean} */ -function isBreakable(path, value, options, adjacentNodes, canBeSpace) { - if ( - options.proseWrap !== "always" || - getAncestorNode(path, SINGLE_LINE_NODE_TYPES) - ) { +function isBreakable(path, value, proseWrap, adjacentNodes, canBeSpace) { + if (proseWrap !== "always" || getAncestorNode(path, SINGLE_LINE_NODE_TYPES)) { return false; } if ( + // no adjacent nodes - happens for linkReference and imageReference nodes adjacentNodes === undefined || - // Space are always breakable + // Spaces are always breakable value === " " ) { return true; @@ -224,9 +238,9 @@ function isBreakable(path, value, options, adjacentNodes, canBeSpace) { const { previous, next } = adjacentNodes; - // Simulates latin words; see #6516 (https://github.com/prettier/prettier/issues/6516) + // Simulates Latin words; see https://github.com/prettier/prettier/issues/6516 // [Latin][""][Hangul] & vice versa => Don't break - // [han & kana][""][hangul], either + // [Han & Kana][""][Hangul], either if ( value === "" && ((previous?.kind === KIND_K_LETTER && isLetter(next?.kind)) || @@ -241,14 +255,6 @@ function isBreakable(path, value, options, adjacentNodes, canBeSpace) { ((next && noBreakBefore.has(next.value[0])) || (previous && noBreakAfter.has(previous.value.at(-1)))); - // When violates the CJK line breaking rules if breaks lines there ("") or leaves surrounding lines divided ("\n")... - // - "" => always `false` (i.e. disallows to break lines there) - // - "\n" => `true` (i.e. don't force surrounding lines to be joined) only when it should be converted to Space - // `false` (i.e. join surrounding lines into a single line) otherwise - // The mandatory joining behavior ("\n" -> "") when Space is not allowed at "\n" is necessary because intentional violation of the line breaking rules (e.g. “ル\nール守れ\n!”) tends to be “corrected” ("\n" -> "") after re-editing the document or changing the value of `printWidth`. - // e.g. “シ\nョ\nートカ\nット\n!\n?” (completely violates the rules on purpose) --[printWidth = 6]-->“ショー\nトカッ\nト!?” --[s/^/こんなところで/]--> “こんな\nところ\nで\nショー\nトカッ\nト!?” (completely complies to the rules) - // On the contrary, if `false` even should be Space (joins lines with an unbreakable " "), the following loop will occur: - // the 2 lines surrounding `"\n"` are joined with `" "` -> divided into 2 lines by `" "` again -> joined again -> ... if (violatesCJKLineBreakingRule) { return false; } @@ -259,11 +265,11 @@ function isBreakable(path, value, options, adjacentNodes, canBeSpace) { /** * @param {AstPath} path * @param {WhitespaceValue} value - * @param {*} options - * @param {AdjacentNodes | undefined} [adjacentNodes] + * @param {ProseWrap} proseWrap + * @param {AdjacentNodes} [adjacentNodes] */ -function printWhitespace(path, value, options, adjacentNodes) { - if (options.proseWrap === "preserve" && value === "\n") { +function printWhitespace(path, value, proseWrap, adjacentNodes) { + if (proseWrap === "preserve" && value === "\n") { return hardline; } @@ -271,7 +277,7 @@ function printWhitespace(path, value, options, adjacentNodes) { value === " " || (value === "\n" && lineBreakCanBeConvertedToSpace(path, adjacentNodes)); - if (isBreakable(path, value, options, adjacentNodes, canBeSpace)) { + if (isBreakable(path, value, proseWrap, adjacentNodes, canBeSpace)) { return canBeSpace ? line : softline; } diff --git a/src/language-markdown/printer-markdown.js b/src/language-markdown/printer-markdown.js index f355e6a16101..718ad19827bd 100644 --- a/src/language-markdown/printer-markdown.js +++ b/src/language-markdown/printer-markdown.js @@ -70,7 +70,7 @@ function genericPrint(path, options, print) { : printWhitespace( path, node.value, - options + options.proseWrap // Without the `adjacentNodes` argument `printWhitespace` wraps/unwraps // text in such a way that the normalized form of a link label stays the same. // See https://spec.commonmark.org/0.30/#matches @@ -138,7 +138,7 @@ function genericPrint(path, options, print) { } case "whitespace": { // WordNode or nullish - const { next, previous } = path; + const { previous, next } = path; const proseWrap = // leading char that may cause different syntax @@ -146,12 +146,7 @@ function genericPrint(path, options, print) { ? "never" : options.proseWrap; - return printWhitespace( - path, - node.value, - { proseWrap }, - { previous, next } - ); + return printWhitespace(path, node.value, proseWrap, { previous, next }); } case "emphasis": { let style; From d65be2bc8fcd4fc115431aa3b2872452fa4786ad Mon Sep 17 00:00:00 2001 From: Georgii Dolzhykov Date: Tue, 18 Oct 2022 23:36:17 +0300 Subject: [PATCH 84/87] Refactor: more explicit parameter for links --- src/language-markdown/print-whitespace.js | 65 +++++++++++------------ src/language-markdown/printer-markdown.js | 16 ++---- 2 files changed, 33 insertions(+), 48 deletions(-) diff --git a/src/language-markdown/print-whitespace.js b/src/language-markdown/print-whitespace.js index b772d6b45565..6ee98639ea50 100644 --- a/src/language-markdown/print-whitespace.js +++ b/src/language-markdown/print-whitespace.js @@ -83,9 +83,6 @@ function isInSentenceWithCJSpaces({ parent: sentenceNode }) { /** * @typedef {import("./utils.js").WordNode} WordNode * @typedef {import("./utils.js").WhitespaceValue} WhitespaceValue - * @typedef {{ next?: WordNode | null, previous?: WordNode | null }} - * AdjacentNodes Nodes adjacent to a `whitespace` node. Are always of type - * `word`. * @typedef {import("./utils.js").WordKind} WordKind * @typedef {import("../common/ast-path.js").default} AstPath * @typedef {"always" | "never" | "preserve"} ProseWrap @@ -107,24 +104,24 @@ function isInSentenceWithCJSpaces({ parent: sentenceNode }) { * * PRs are welcome to support line breaking rules for other languages. * - * @param {AstPath} path path of given node - * @param {AdjacentNodes | undefined} adjacentNodes adjacent sibling nodes of - * the given node + * @param {AstPath} path + * @param {boolean} isLink * @returns {boolean} */ -function lineBreakCanBeConvertedToSpace(path, adjacentNodes) { - if ( - // no adjacent nodes - happens for linkReference and imageReference nodes - !adjacentNodes || - // e.g. " \nletter" - !adjacentNodes.previous || - !adjacentNodes.next - ) { +function lineBreakCanBeConvertedToSpace(path, isLink) { + if (isLink) { + return true; + } + + const { previous, next } = path; + + // e.g. " \nletter" + if (!previous || !next) { return true; } - const previousKind = adjacentNodes.previous.kind; - const nextKind = adjacentNodes.next.kind; + const previousKind = previous.kind; + const nextKind = next.kind; if ( // "\n" between non-CJK or Korean characters always can be converted to a @@ -153,8 +150,8 @@ function lineBreakCanBeConvertedToSpace(path, adjacentNodes) { // The rest of this function deals only with line breaks between CJ and // non-CJK characters. - const characterBefore = adjacentNodes.previous.value.at(-1); - const characterAfter = adjacentNodes.next.value[0]; + const characterBefore = previous.value.at(-1); + const characterAfter = next.value[0]; // Convert a line break between CJ and certain non-letter characters (e.g. // ASCII punctuation) to a space. @@ -182,10 +179,7 @@ function lineBreakCanBeConvertedToSpace(path, adjacentNodes) { // 2. "…" (U+2026, belongs to Po) in // // "これはひどい……\nなんと汚いコミットログなんだ……" - if ( - adjacentNodes.previous.hasTrailingPunctuation || - adjacentNodes.next.hasLeadingPunctuation - ) { + if (previous.hasTrailingPunctuation || next.hasLeadingPunctuation) { return false; } @@ -218,25 +212,25 @@ function isNonCJKOrKoreanLetter(kind) { * @param {AstPath} path * @param {WhitespaceValue} value * @param {ProseWrap} proseWrap - * @param {AdjacentNodes | undefined} adjacentNodes + * @param {boolean} isLink * @param {boolean} canBeSpace * @returns {boolean} */ -function isBreakable(path, value, proseWrap, adjacentNodes, canBeSpace) { +function isBreakable(path, value, proseWrap, isLink, canBeSpace) { if (proseWrap !== "always" || getAncestorNode(path, SINGLE_LINE_NODE_TYPES)) { return false; } - if ( - // no adjacent nodes - happens for linkReference and imageReference nodes - adjacentNodes === undefined || - // Spaces are always breakable - value === " " - ) { + if (isLink) { + return value !== ""; + } + + // Spaces are always breakable + if (value === " ") { return true; } - const { previous, next } = adjacentNodes; + const { previous, next } = path; // Simulates Latin words; see https://github.com/prettier/prettier/issues/6516 // [Latin][""][Hangul] & vice versa => Don't break @@ -266,18 +260,19 @@ function isBreakable(path, value, proseWrap, adjacentNodes, canBeSpace) { * @param {AstPath} path * @param {WhitespaceValue} value * @param {ProseWrap} proseWrap - * @param {AdjacentNodes} [adjacentNodes] + * @param {boolean} [isLink] Wrap/unwrap text so that the normalized form of a + * link label stays the same. See https://spec.commonmark.org/0.30/#matches */ -function printWhitespace(path, value, proseWrap, adjacentNodes) { +function printWhitespace(path, value, proseWrap, isLink) { if (proseWrap === "preserve" && value === "\n") { return hardline; } const canBeSpace = value === " " || - (value === "\n" && lineBreakCanBeConvertedToSpace(path, adjacentNodes)); + (value === "\n" && lineBreakCanBeConvertedToSpace(path, isLink)); - if (isBreakable(path, value, proseWrap, adjacentNodes, canBeSpace)) { + if (isBreakable(path, value, proseWrap, isLink, canBeSpace)) { return canBeSpace ? line : softline; } diff --git a/src/language-markdown/printer-markdown.js b/src/language-markdown/printer-markdown.js index 718ad19827bd..d29853284ae8 100644 --- a/src/language-markdown/printer-markdown.js +++ b/src/language-markdown/printer-markdown.js @@ -65,16 +65,7 @@ function genericPrint(path, options, print) { ).map((node) => node.type === "word" ? node.value - : node.value === "" - ? "" - : printWhitespace( - path, - node.value, - options.proseWrap - // Without the `adjacentNodes` argument `printWhitespace` wraps/unwraps - // text in such a way that the normalized form of a link label stays the same. - // See https://spec.commonmark.org/0.30/#matches - ) + : printWhitespace(path, node.value, options.proseWrap, true) ); } @@ -137,8 +128,7 @@ function genericPrint(path, options, print) { return escapedValue; } case "whitespace": { - // WordNode or nullish - const { previous, next } = path; + const { next } = path; const proseWrap = // leading char that may cause different syntax @@ -146,7 +136,7 @@ function genericPrint(path, options, print) { ? "never" : options.proseWrap; - return printWhitespace(path, node.value, proseWrap, { previous, next }); + return printWhitespace(path, node.value, proseWrap); } case "emphasis": { let style; From 4e145aed5895563ab97b81c95e730ad41c25d779 Mon Sep 17 00:00:00 2001 From: Georgii Dolzhykov Date: Tue, 18 Oct 2022 23:58:30 +0300 Subject: [PATCH 85/87] Minor refactoring --- src/language-markdown/print-whitespace.js | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/src/language-markdown/print-whitespace.js b/src/language-markdown/print-whitespace.js index 6ee98639ea50..c28b158ce806 100644 --- a/src/language-markdown/print-whitespace.js +++ b/src/language-markdown/print-whitespace.js @@ -150,9 +150,6 @@ function lineBreakCanBeConvertedToSpace(path, isLink) { // The rest of this function deals only with line breaks between CJ and // non-CJK characters. - const characterBefore = previous.value.at(-1); - const characterAfter = next.value[0]; - // Convert a line break between CJ and certain non-letter characters (e.g. // ASCII punctuation) to a space. // @@ -161,8 +158,8 @@ function lineBreakCanBeConvertedToSpace(path, isLink) { // Note: line breaks like "(\n句子句子\n)" or "句子\n." are suppressed in // `isBreakable(...)`. if ( - lineBreakBetweenTheseAndCJConvertsToSpace.has(characterAfter) || - lineBreakBetweenTheseAndCJConvertsToSpace.has(characterBefore) + lineBreakBetweenTheseAndCJConvertsToSpace.has(next.value[0]) || + lineBreakBetweenTheseAndCJConvertsToSpace.has(previous.value.at(-1)) ) { return true; } @@ -244,12 +241,12 @@ function isBreakable(path, value, proseWrap, isLink, canBeSpace) { } // https://en.wikipedia.org/wiki/Line_breaking_rules_in_East_Asian_languages - const violatesCJKLineBreakingRule = + const violatesCJKLineBreakingRules = !canBeSpace && ((next && noBreakBefore.has(next.value[0])) || (previous && noBreakAfter.has(previous.value.at(-1)))); - if (violatesCJKLineBreakingRule) { + if (violatesCJKLineBreakingRules) { return false; } @@ -260,8 +257,8 @@ function isBreakable(path, value, proseWrap, isLink, canBeSpace) { * @param {AstPath} path * @param {WhitespaceValue} value * @param {ProseWrap} proseWrap - * @param {boolean} [isLink] Wrap/unwrap text so that the normalized form of a - * link label stays the same. See https://spec.commonmark.org/0.30/#matches + * @param {boolean} [isLink] Special mode of (un)wrapping that preserves the + * normalized form of link labels. https://spec.commonmark.org/0.30/#matches */ function printWhitespace(path, value, proseWrap, isLink) { if (proseWrap === "preserve" && value === "\n") { From 86a41b5766b7f6fa19cb4f23dde63d58a0311f7d Mon Sep 17 00:00:00 2001 From: Georgii Dolzhykov Date: Wed, 19 Oct 2022 01:13:31 +0300 Subject: [PATCH 86/87] Impove types --- src/language-markdown/print-whitespace.js | 5 +++++ src/language-markdown/utils.js | 4 +--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/language-markdown/print-whitespace.js b/src/language-markdown/print-whitespace.js index c28b158ce806..0a884d4dc1e5 100644 --- a/src/language-markdown/print-whitespace.js +++ b/src/language-markdown/print-whitespace.js @@ -86,6 +86,9 @@ function isInSentenceWithCJSpaces({ parent: sentenceNode }) { * @typedef {import("./utils.js").WordKind} WordKind * @typedef {import("../common/ast-path.js").default} AstPath * @typedef {"always" | "never" | "preserve"} ProseWrap + * @typedef {{ next?: WordNode | null, previous?: WordNode | null }} + * AdjacentNodes Nodes adjacent to a `whitespace` node. Are always of type + * `word`. */ /** @@ -113,6 +116,7 @@ function lineBreakCanBeConvertedToSpace(path, isLink) { return true; } + /** @type {AdjacentNodes} */ const { previous, next } = path; // e.g. " \nletter" @@ -227,6 +231,7 @@ function isBreakable(path, value, proseWrap, isLink, canBeSpace) { return true; } + /** @type {AdjacentNodes} */ const { previous, next } = path; // Simulates Latin words; see https://github.com/prettier/prettier/issues/6516 diff --git a/src/language-markdown/utils.js b/src/language-markdown/utils.js index 9ae051340273..139142b2e53e 100644 --- a/src/language-markdown/utils.js +++ b/src/language-markdown/utils.js @@ -48,9 +48,7 @@ const KIND_CJK_PUNCTUATION = "cjk-punctuation"; * @typedef {{ * type: "whitespace", * value: WhitespaceValue, - * kind?: undefined, - * hasLeadingPunctuation?: undefined, - * hasTrailingPunctuation?: undefined, + * kind?: never * }} WhitespaceNode * @typedef {{ * type: "word", From 1d703d2e643c9061681bf884e8aa30dac597da40 Mon Sep 17 00:00:00 2001 From: Georgii Dolzhykov Date: Wed, 19 Oct 2022 01:16:49 +0300 Subject: [PATCH 87/87] Move typedefs after imports --- src/language-markdown/print-whitespace.js | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/language-markdown/print-whitespace.js b/src/language-markdown/print-whitespace.js index 0a884d4dc1e5..7a865b055b21 100644 --- a/src/language-markdown/print-whitespace.js +++ b/src/language-markdown/print-whitespace.js @@ -7,6 +7,17 @@ import { getAncestorNode, } from "./utils.js"; +/** + * @typedef {import("./utils.js").WordNode} WordNode + * @typedef {import("./utils.js").WhitespaceValue} WhitespaceValue + * @typedef {import("./utils.js").WordKind} WordKind + * @typedef {import("../common/ast-path.js").default} AstPath + * @typedef {"always" | "never" | "preserve"} ProseWrap + * @typedef {{ next?: WordNode | null, previous?: WordNode | null }} + * AdjacentNodes Nodes adjacent to a `whitespace` node. Are always of type + * `word`. + */ + const SINGLE_LINE_NODE_TYPES = ["heading", "tableCell", "link", "wikiLink"]; /** @@ -80,17 +91,6 @@ function isInSentenceWithCJSpaces({ parent: sentenceNode }) { return sentenceNode.usesCJSpaces; } -/** - * @typedef {import("./utils.js").WordNode} WordNode - * @typedef {import("./utils.js").WhitespaceValue} WhitespaceValue - * @typedef {import("./utils.js").WordKind} WordKind - * @typedef {import("../common/ast-path.js").default} AstPath - * @typedef {"always" | "never" | "preserve"} ProseWrap - * @typedef {{ next?: WordNode | null, previous?: WordNode | null }} - * AdjacentNodes Nodes adjacent to a `whitespace` node. Are always of type - * `word`. - */ - /** * Check whether the given `"\n"` node can be converted to a space. *