kpdecker · jongaull-nimbly · Sep 17, 2019 · Sep 17, 2019 · ExplodingCabbage · Dec 15, 2023
diff --git a/README.md b/README.md
@@ -73,7 +73,9 @@ npm install diff --save
     * `newStr` : New string value
     * `oldHeader` : Additional information to include in the old file header
     * `newHeader` : Additional information to include in the new file header
-    * `options` : An object with options. Currently, only `context` is supported and describes how many lines of context should be included.
+    * `options` : An object with options.
+        * `context` : Describes how many lines of context should be included.
+        * `tokenizer` : Overrides the default regex used to split text into words. Supported by `diffWords` and `diffWordsWithSpace`.
 
 * `Diff.createPatch(fileName, oldStr, newStr, oldHeader, newHeader)` - creates a unified diff patch.
 

diff --git a/src/diff/word.js b/src/diff/word.js
@@ -32,7 +32,8 @@ wordDiff.equals = function(left, right) {
   return left === right || (this.options.ignoreWhitespace && !reWhitespace.test(left) && !reWhitespace.test(right));
 };
 wordDiff.tokenize = function(value) {
-  let tokens = value.split(/(\s+|[()[\]{}'"]|\b)/);
+  const tokenizer = this.options.tokenizer || /(\s+|[()[\]{}'"]|\b)/; // Use the tokenizer regex in the options or use the default regex
+  const tokens = value.split(tokenizer);
 
   // Join the boundary splits that we do not consider to be boundaries. This is primarily the extended Latin character set.
   for (let i = 0; i < tokens.length - 1; i++) {

diff --git a/test/diff/word.js b/test/diff/word.js
@@ -171,6 +171,29 @@ describe('WordDiff', function() {
         done();
       });
     });
+
+    // With custom tokenizer
+    it('should utilize a custom tokenizer', function() {
+
+      const diff = diffWords('foo_bar', 'something_bar', {
+        tokenizer: /(\s+|[()[\]{}_'"]|\b)/
+      });
+
+      expect(diff).to.eql([{
+        count: 1,
+        added: undefined,
+        removed: true,
+        value: 'foo'
+      }, {
+        count: 1,
+        added: true,
+        removed: undefined,
+        value: 'something'
+      }, {
+        count: 2,
+        value: '_bar'
+      }]);
+    });
   });
 
   describe('#diffWordsWithSpace', function() {