Add CST tools #252

Merged: 8 commits, Apr 11, 2021
6 changes: 3 additions & 3 deletions README.md
@@ -74,9 +74,9 @@ const YAML = require('yaml')

### Parsing YAML

- [`new Lexer(push)`](https://eemeli.org/yaml/#lexer)
- [`new Parser(push, onNewLine?)`](https://eemeli.org/yaml/#parser)
- [`new Composer(push, options?)`](https://eemeli.org/yaml/#composer)
- [`new Lexer().lex(src)`](https://eemeli.org/yaml/#lexer)
- [`new Parser(onNewLine?).parse(src)`](https://eemeli.org/yaml/#parser)
- [`new Composer(options?).compose(tokens)`](https://eemeli.org/yaml/#composer)

## YAML.parse

6 changes: 3 additions & 3 deletions docs/01_intro.md
@@ -96,6 +96,6 @@ import {
import { Composer, Lexer, Parser } from 'yaml'
```

- [`new Lexer(push)`](#lexer)
- [`new Parser(push, onNewLine?)`](#parser)
- [`new Composer(push, options?)`](#composer)
- [`new Lexer().lex(src)`](#lexer)
- [`new Parser(onNewLine?).parse(src)`](#parser)
- [`new Composer(options?).compose(tokens)`](#composer)
217 changes: 183 additions & 34 deletions docs/07_parsing_yaml.md
@@ -3,11 +3,11 @@
<!-- prettier-ignore -->
```js
import {
Lexer,
Parser,
Composer,
CST,
Lexer,
LineCounter,
tokens
Parser,
} from 'yaml'
```

@@ -28,10 +28,8 @@ Both the Lexer and Parser accept incomplete input, allowing for them and the Com
```js
import { Lexer } from 'yaml'

const tokens = []
const lexer = new Lexer(tok => tokens.push(tok))
lexer.lex('foo: bar\nfee:\n [24,"42"]\n', false)
console.dir(tokens)
const tokens = new Lexer().lex('foo: bar\nfee:\n [24,"42"]\n')
console.dir(Array.from(tokens))
> [
'\x02', '\x1F', 'foo', ':',
' ', '\x1F', 'bar', '\n',
@@ -41,12 +39,11 @@ console.dir(tokens)
]
```

#### `new Lexer(push: (token: string) => void)`
#### `new Lexer()`

#### `lexer.lex(src: string, incomplete: boolean): void`
#### `lexer.lex(src: string, incomplete?: boolean): Generator<string>`

The API for the lexer is rather minimal, and offers no configuration.
The constructor accepts a single callback as argument, defining a function that will be called once for each lexical token.
If the input stream is chunked, the `lex()` method may be called separately for each chunk if the `incomplete` argument is `true`.
At the end of input, `lex()` should be called a final time with `incomplete: false` to ensure that the remaining tokens are emitted.

@@ -97,8 +94,8 @@ All remaining tokens are identifiable by their first character:
```js
import { Parser } from 'yaml'

const parser = new Parser(tok => console.dir(tok, { depth: null }))
parser.parse('foo: [24,"42"]\n', false)
for (const token of new Parser().parse('foo: [24,"42"]\n'))
console.dir(token, { depth: null })

> {
type: 'document',
@@ -153,31 +150,28 @@ It should never throw errors, but may (rarely) include error tokens in its output
To validate a CST, you will need to compose it into a `Document`.
If the document contains errors, they will be included in the document's `errors` array, and each error will contain an `offset` within the source string, which you may then use to find the corresponding node in the CST.

#### `new Parser(push: (token: Token) => void, onNewLine?: (offset: number) => void)`
#### `new Parser(onNewLine?: (offset: number) => void)`

Create a new parser.
`push` is called separately with each parsed token.
If defined, `onNewLine` is called separately with the start position of each new line (in `parse()`, including the start of input).

#### `parser.parse(source: string, incomplete = false)`
#### `parser.parse(source: string, incomplete = false): Generator<Token, void>`

Parse `source` as a YAML stream, calling `push` with each directive, document and other structure as it is completely parsed.
Parse `source` as a YAML stream, generating tokens for each directive, document and other structure as it is completely parsed.
If `incomplete`, a part of the last line may be left as a buffer for the next call.

Errors are not thrown, but pushed out as `{ type: 'error', offset, message }` tokens.
Errors are not thrown, but are yielded as `{ type: 'error', offset, message }` tokens.

#### `parser.next(lexToken: string)`
#### `parser.next(lexToken: string): Generator<Token, void>`

Advance the parser by one lexical token.
Bound to the Parser instance, so may be used directly as a callback function.

Used internally by `parser.parse()`; exposed to allow for use with an external lexer.

For debug purposes, if the `LOG_TOKENS` env var is true-ish, all lexical tokens will be pretty-printed using `console.log()` as they are being processed.

### CST Nodes

For a complete description of CST node interfaces, please consult the [tokens.ts source](https://github.com/eemeli/yaml/blob/master/src/parse/tokens.ts).
For a complete description of CST node interfaces, please consult the [cst.ts source](https://github.com/eemeli/yaml/blob/master/src/parse/cst.ts).

Some of the most common node properties include:

@@ -205,8 +199,9 @@ Collection items contain some subset of the following properties:
import { LineCounter, Parser } from 'yaml'

const lineCounter = new LineCounter()
const parser = new Parser(() => {}, lineCounter.addNewLine)
parser.parse('foo:\n- 24\n- "42"\n')
const parser = new Parser(lineCounter.addNewLine)
const tokens = parser.parse('foo:\n- 24\n- "42"\n')
Array.from(tokens) // forces iteration

lineCounter.lineStarts
> [ 0, 5, 10, 17 ]
@@ -236,28 +231,31 @@ If `line === 0`, `addNewLine` has never been called or `offset` is before the fi
<!-- prettier-ignore -->
```js
import { Composer, Parser } from 'yaml'
const docs = []
const composer = new Composer(doc => docs.push(doc))
const parser = new Parser(composer.next)
parser.parse('foo: bar\nfee: [24, "42"]')
composer.end()

docs.map(doc => doc.toJS())
const src = 'foo: bar\nfee: [24, "42"]'
const tokens = new Parser().parse(src)
const docs = new Composer().compose(tokens)

Array.from(docs, doc => doc.toJS())
> [{ foo: 'bar', fee: [24, '42'] }]
```

#### `new Composer(push: (doc: Document.Parsed) => void, options?: Options)`
#### `new Composer(options?: ParseOptions & DocumentOptions & SchemaOptions)`

Create a new Document composer.
Does not include an internal Parser instance, so an external one will be needed.
`options` will be used during composition, and passed to the `new Document` constructor; may include any of ParseOptions, DocumentOptions, and SchemaOptions.
`options` will be used during composition, and passed to the `new Document` constructor.

#### `composer.compose(tokens: Iterable<Token>, forceDoc?: boolean, endOffset?: number): Generator<Document.Parsed>`

#### `composer.next(token: Token)`
Compose tokens into documents.
Convenience wrapper combining calls to `composer.next()` and `composer.end()`.

#### `composer.next(token: Token): Generator<Document.Parsed>`

Advance the composer by one CST token.
Bound to the Composer instance, so may be used directly as a callback function.

#### `composer.end(forceDoc?: boolean, offset?: number)`
#### `composer.end(forceDoc?: boolean, offset?: number): Generator<Document.Parsed>`

Always call at end of input to push out any remaining document.
If `forceDoc` is true and the stream contains no document, still emit a final document including any comments and directives that would be applied to a subsequent document.
@@ -267,3 +265,154 @@

#### `composer.streamInfo()`

Current stream status information.
Mostly useful at the end of input for an empty stream.

## Working with CST Tokens

```ts
import { CST } from 'yaml'
```

For most use cases, the Document or pure JS interfaces provided by the library are the right tool.
Sometimes, though, it's important to keep the original YAML source in as pristine a condition as possible.
For those cases, the concrete syntax tree (CST) representation is provided, as it retains every character of the input, including whitespace.

#### `CST.createScalarToken(value: string, context): BlockScalar | FlowScalar`

Create a new scalar token with the value `value`.
Values that represent an actual string but may be parsed as a different type should use a `type` other than `'PLAIN'`,
as this function does not support any schema operations and won't check for such conflicts.

| Argument | Type | Default | Description |
| ------------------- | --------------- | ------- | ----------------------------------------------------------------------------------------------------------------------------- |
| value | `string` | | The string representation of the value, which will have its content properly indented. **Required.** |
| context.end | `SourceToken[]` | | Comments and whitespace after the end of the value, or after the block scalar header. If undefined, a newline will be added. |
| context.implicitKey | `boolean` | `false` | Being within an implicit key may affect the resolved type of the token's value. |
| context.indent | `number` | | The indent level of the token. **Required.** |
| context.inFlow | `boolean` | `false` | Is this scalar within a flow collection? This may affect the resolved type of the token's value. |
| context.offset | `number` | `-1` | The offset position of the token. |
| context.type | `Scalar.Type` | | The preferred type of the scalar token. If undefined, the previous type of the `token` will be used, defaulting to `'PLAIN'`. |

<!-- prettier-ignore -->
```js
import { CST, Parser } from 'yaml'

const [doc] = new Parser().parse('foo: "bar" #comment')
const item = doc.value.items[0].value
> {
type: 'double-quoted-scalar',
offset: 5,
indent: 0,
source: '"bar"',
end: [
{ type: 'space', offset: 10, indent: 0, source: ' ' },
{ type: 'comment', offset: 11, indent: 0, source: '#comment' }
]
}

CST.resolveAsScalar(item)
> { value: 'bar', type: 'QUOTE_DOUBLE', comment: 'comment', length: 14 }
```

#### `CST.isCollection(token?: Token): boolean`

#### `CST.isScalar(token?: Token): boolean`

Custom type guards for detecting CST collections and scalars, in both their block and flow forms.

#### `CST.resolveAsScalar(token?: Token, strict = true, onError?: ComposeErrorHandler)`

If `token` is a CST flow or block scalar, determine its string value and a few other attributes.
Otherwise, return `null`.

#### `CST.setScalarValue(token: Token, value: string, context?)`

Set the value of `token` to the given string `value`, overwriting any previous contents and type that it may have.

Best efforts are made to retain any comments previously associated with the `token`, though all contents within a collection's `items` will be overwritten.

Values that represent an actual string but may be parsed as a different type should use a `type` other than `'PLAIN'`, as this function does not support any schema operations and won't check for such conflicts.

| Argument | Type | Default | Description |
| ------------------- | ------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------- |
| token | `Token` | | Any token. If it does not include an `indent` value, the value will be stringified as if it were an implicit key. **Required.** |
| value | `string` | | The string representation of the value, which will have its content properly indented. **Required.** |
| context.afterKey | `boolean` | `false` | In most cases, values after a key should have an additional level of indentation. |
| context.implicitKey | `boolean` | `false` | Being within an implicit key may affect the resolved type of the token's value. |
| context.inFlow | `boolean` | `false` | Being within a flow collection may affect the resolved type of the token's value. |
| context.type | `Scalar.Type` | | The preferred type of the scalar token. If undefined, the previous type of the `token` will be used, defaulting to `'PLAIN'`. |

```ts
function findScalarAtOffset(
cst: CST.Document,
offset: number
): CST.FlowScalar | CST.BlockScalar | undefined {
let res: CST.FlowScalar | CST.BlockScalar | undefined = undefined
CST.visit(cst, ({ key, value }) => {
for (const token of [key, value])
if (CST.isScalar(token)) {
if (token.offset > offset) return CST.visit.BREAK
if (
token.offset == offset ||
(token.offset < offset && token.offset + token.source.length > offset)
) {
res = token
return CST.visit.BREAK
}
}
})
return res
}
```

#### `CST.stringify(cst: Token | CollectionItem): string`

Stringify a CST document, token, or collection item.
Fair warning: This applies no validation whatsoever, and simply concatenates the sources in their logical order.

#### `CST.visit(cst: CST.Document | CST.CollectionItem, visitor: CSTVisitor)`

Apply a visitor to a CST document or item.
Effectively, the general-purpose workhorse of navigating the CST.

Walks through the tree (depth-first) starting from `cst` as the root, calling a `visitor` function with two arguments when entering each item:

- `item`: The current item, which includes the following members:
- `start: SourceToken[]` – Source tokens before the key or value, possibly including its anchor or tag.
- `key?: Token | null` – Set for pair values. May then be `null`, if the key before the `:` separator is empty.
- `sep?: SourceToken[]` – Source tokens between the key and the value, which should include the `:` map value indicator if `value` is set.
- `value?: Token` – The value of a sequence item, or of a map pair.
- `path`: The steps from the root to the current node, as an array of `['key' | 'value', number]` tuples.

The return value of the visitor may be used to control the traversal:

- `undefined` (default): Do nothing and continue
- `CST.visit.SKIP`: Do not visit the children of this token, continue with next sibling
- `CST.visit.BREAK`: Terminate traversal completely
- `CST.visit.REMOVE`: Remove the current item, then continue with the next one
- `number`: Set the index of the next step. This is useful especially if the index of the current token has changed.
- `function`: Define the next visitor for this item. After the original visitor is called on item entry, next visitors are called after handling a non-empty `key` and when exiting the item.

<!-- prettier-ignore -->
```js
const [doc] = new Parser().parse('[ foo, bar, baz ]')
CST.visit(doc, (item, path) => {
if (!CST.isScalar(item.value)) return
const scalar = CST.resolveAsScalar(item.value)
if (scalar?.value === 'bar') {
const parent = CST.visit.parentCollection(doc, path)
const idx = path[path.length - 1][1]
const { indent } = item.value
parent.items.splice(idx, 0, {
start: item.start.slice(),
value: CST.createScalarToken('bing', { end: [], indent })
})
return idx + 2
}
})

CST.stringify(doc)
> '[ foo, bing, bar, baz ]'
```

A couple of utility functions are provided for working with the `path`:

- `CST.visit.itemAtPath(cst, path): CST.CollectionItem | undefined` – Find the item at `path` from `cst` as the root.
- `CST.visit.parentCollection(cst, path): CST.BlockMap | CST.BlockSequence | CST.FlowCollection` – Get the immediate parent collection of the item at `path` from `cst` as the root. Throws an error if the collection is not found, which should never happen if the item itself exists.
6 changes: 1 addition & 5 deletions src/compose/compose-collection.ts
@@ -2,11 +2,7 @@ import { isMap, isNode, ParsedNode } from '../nodes/Node.js'
import { Scalar } from '../nodes/Scalar.js'
import type { YAMLMap } from '../nodes/YAMLMap.js'
import type { YAMLSeq } from '../nodes/YAMLSeq.js'
import type {
BlockMap,
BlockSequence,
FlowCollection
} from '../parse/tokens.js'
import type { BlockMap, BlockSequence, FlowCollection } from '../parse/cst.js'
import { CollectionTag } from '../schema/types.js'
import type { ComposeContext, ComposeNode } from './compose-node.js'
import type { ComposeErrorHandler } from './composer.js'
4 changes: 2 additions & 2 deletions src/compose/compose-doc.ts
@@ -5,7 +5,7 @@ import type {
ParseOptions,
SchemaOptions
} from '../options.js'
import type * as Tokens from '../parse/tokens.js'
import type * as CST from '../parse/cst.js'
import {
ComposeContext,
composeEmptyNode,
@@ -18,7 +18,7 @@ import { resolveProps } from './resolve-props.js'
export function composeDoc(
options: ParseOptions & DocumentOptions & SchemaOptions,
directives: Directives,
{ offset, start, value, end }: Tokens.Document,
{ offset, start, value, end }: CST.Document,
onError: ComposeErrorHandler
) {
const opts = Object.assign({ directives }, options)
2 changes: 1 addition & 1 deletion src/compose/compose-node.ts
Expand Up @@ -2,7 +2,7 @@ import type { Directives } from '../doc/directives.js'
import { Alias } from '../nodes/Alias.js'
import type { ParsedNode } from '../nodes/Node.js'
import type { ParseOptions } from '../options.js'
import type { FlowScalar, Token } from '../parse/tokens.js'
import type { FlowScalar, Token } from '../parse/cst.js'
import type { Schema } from '../schema/Schema.js'
import { composeCollection } from './compose-collection.js'
import { composeScalar } from './compose-scalar.js'
2 changes: 1 addition & 1 deletion src/compose/compose-scalar.ts
@@ -1,6 +1,6 @@
import { isScalar, SCALAR } from '../nodes/Node.js'
import { Scalar } from '../nodes/Scalar.js'
import type { BlockScalar, FlowScalar } from '../parse/tokens.js'
import type { BlockScalar, FlowScalar } from '../parse/cst.js'
import type { Schema } from '../schema/Schema.js'
import type { ScalarTag } from '../schema/types.js'
import type { ComposeContext } from './compose-node.js'