Skip to content

Commit

Permalink
Choose diff-sequences for diff algorithm
Browse files Browse the repository at this point in the history
Summary:
Before this change, there are 3 Myers diff algorithms used in the dependency tree:
- diff-match-patch (1.0.5)
- diff (4.0.1)
- diff-sequences (via jest -> jest-diff -> diff-sequences)

We'd like to simplify the dependency tree. The short answer is:
- Use `diff-sequences`, or `jest-diff` which uses `diff-sequences` internally.

For best performance, do:
- Strip common prefix and suffix.
- Make line comparison O(1), avoid `line1 === line2` which can be O(line
  length).
- Consider skipping "cleanup" in `jest-diff` for long input.

----

Long answer of picking a diff library:

I wrote a benchmark script to get some idea about their performance:

    const fs = require('fs')
    const dmp = new (require('diff-match-patch').diff_match_patch)();
    const diff = require('diff');
    const ds = require('diff-sequences').default;
    const jd = require('jest-diff');

    dmp.Diff_Timeout = 120;

    // Diff functions. Output format: Chunk[]
    // Chunk is one of:
    // [0, n]: n common lines (same on both side)
    // [-1, n]: n left-side-only lines
    // [1, n]: n right-side-only lines

    function diff1(chars1, chars2) {
      return dmp.diff_main(chars1, chars2).map(v => [v[0], v[1].length]);
    }

    function diff1a(chars1, chars2) {
      return dmp.diff_main(chars1, chars2, false).map(v => [v[0], v[1].length]);
    }

    function diff2(chars1, chars2) {
      return diff.diffChars(chars1, chars2).map(v => {
        const d = v.added ? 1 : (v.removed ? -1 : 0);
        return [d, v.count];
      });
    }

    function diff3(chars1, chars2) {
      function isCommon(ai, bi) {
        return chars1[ai] == chars2[bi];
      }
      const r = [];
      let lastA = 0, lastB = 0;
      function foundSequence(n, na, nb) {
        if (na > lastA) {
          r.push([-1, na - lastA]);
          lastA = na;
        }
        if (nb > lastB) {
          r.push([1, nb - lastB]);
          lastB = nb;
        }
        if (n > 0) {
          r.push([0, n]);
          lastA += n;
          lastB += n;
        }
      }
      ds(chars1.length, chars2.length, isCommon, foundSequence);
      foundSequence(0, chars1.length, chars2.length);
      return r;
    }

    function diff3a(chars1, chars2) {
      return jd.diffStringsRaw(chars1, chars2, false).map((d) => [d[0], d[1].length]);
    }

    function diff3b(chars1, chars2) {
      return jd.diffStringsRaw(chars1, chars2, true).map((d) => [d[0], d[1].length]);
    }

    function bench(a, b) {
      const {chars1, chars2} = dmp.diff_linesToChars_(a, b);
      function stringify(obj) {
        if (obj.length > 20) {
          return `${obj.length} items`;
        } else {
          return JSON.stringify(obj);
        }
      }

      [
        ['diff-match-patch', diff1],
        ['diff-match-patch (checklines=false)', diff1a],
        ['diff-sequences', diff3],
        ['jest-diff (diff-sequences), no cleanup', diff3a],
        ['jest-diff (diff-sequences), with cleanup', diff3b],
        ['jsdiff', diff2],
      ].forEach(([name, diffFunc]) => {
        // node --expose_gc
        if (global.gc) {
          gc();
        }
        const label = `  ${name}`;
        console.time(label);
        console.log(' ', stringify(diffFunc(chars1, chars2)));
        console.timeEnd(label);
      });
    }

    let a, b;

    console.log('\nwith common prefix and suffix 1');
    a = 'aaaaaaa\n'.repeat(50000) + 'bbbb\n' + 'dddd\n'.repeat(50000);
    b = 'aaaaaaa\n'.repeat(50000) + 'cccc\n' + 'dddd\n'.repeat(50000);
    bench(a, b);

    console.log('\nwith common prefix and suffix 2');
    a = 'aaaaaaa\n'.repeat(50000) + 'bbbbbbb\n' + 'dddd\n'.repeat(50000);
    b = 'aaaaaaa\n'.repeat(50100) + 'cccc\n' + 'dddd\n'.repeat(49900);
    bench(a, b);

    console.log('\nwithout common prefix or suffix 1');
    a = 'c\n' + 'aaaaaaa\n'.repeat(50000) + 'dddd\n'.repeat(50000);
    b = 'aaaaaaa\n'.repeat(50000) + 'dddd\n'.repeat(50100) + 'z\n';
    bench(a, b);

    console.log('\nwithout common prefix or suffix 2');
    a = 'cccc\n' + 'aaaaaaa\n'.repeat(50000) + 'bbbbbbb\n' + 'dddd\n'.repeat(50000) + 'z\n';
    b = 'aaaaaaa\n'.repeat(50100) + 'cccc\n' + 'dddd\n'.repeat(49900) + 'z\ny\n';
    bench(a, b);

    // Hearthstone cards.json in different languages.
    // This is somewhat challenging since many lines are changed.
    // wget https://api.hearthstonejson.com/v1/168129/enUS/cards.json -O 1
    // wget https://api.hearthstonejson.com/v1/168129/zhCN/cards.json -O 2
    // python3 -m json.tool < 1 > 1.json
    // python3 -m json.tool < 2 > 2.json
    console.log('\ncards.json with different languages');
    a = fs.readFileSync('1.json', {encoding: 'utf-8'});
    b = fs.readFileSync('2.json', {encoding: 'utf-8'});
    bench(a, b);

The output looks like:

    with common prefix and suffix 1
      [[0,50000],[-1,1],[1,1],[0,50000]]
      diff-match-patch: 5.073ms
      [[0,50000],[-1,1],[1,1],[0,50000]]
      diff-match-patch (checklines=false): 0.481ms
      [[0,50000],[-1,1],[1,1],[0,50000]]
      diff-sequences: 7.589ms
      [[0,50000],[-1,1],[1,1],[0,50000]]
      jest-diff (diff-sequences), no cleanup: 10.915ms
      [[0,50000],[-1,1],[1,1],[0,50000]]
      jest-diff (diff-sequences), with cleanup: 10.588ms
      [[0,50000],[-1,1],[1,1],[0,50000]]
      jsdiff: 22.664ms

    with common prefix and suffix 2
      [[0,50000],[-1,101],[1,101],[0,49900]]
      diff-match-patch: 10.688ms
      [[0,50000],[-1,101],[1,101],[0,49900]]
      diff-match-patch (checklines=false): 2.619ms
      [[0,50000],[-1,101],[1,101],[0,49900]]
      diff-sequences: 12.687ms
      [[0,50000],[-1,101],[1,101],[0,49900]]
      jest-diff (diff-sequences), no cleanup: 11.055ms
      [[0,50000],[-1,101],[1,101],[0,49900]]
      jest-diff (diff-sequences), with cleanup: 4.356ms
      [[0,50000],[-1,1],[1,101],[0,49900],[-1,100]]
      jsdiff: 59.359ms

    without common prefix or suffix 1
      [[-1,1],[0,100000],[1,101]]
      diff-match-patch: 632.863ms
      [[-1,1],[0,100000],[1,101]]
      diff-match-patch (checklines=false): 607.796ms
      [[-1,1],[0,50000],[1,51],[0,50000],[1,50]]
      diff-sequences: 12.366ms
      [[-1,1],[0,50000],[1,51],[0,50000],[1,50]]
      jest-diff (diff-sequences), no cleanup: 11.096ms
      [[-1,1],[0,100000],[1,51],[1,50]]
      jest-diff (diff-sequences), with cleanup: 1.029s
      [[-1,1],[0,100000],[1,101]]
      jsdiff: 13.163ms

    without common prefix or suffix 2
      [[-1,1],[0,50000],[-1,101],[1,101],[0,49901],[1,1]]
      diff-match-patch: 2.773s
      [[-1,1],[0,50000],[-1,101],[1,101],[0,49901],[1,1]]
      diff-match-patch (checklines=false): 1.402s
      [[-1,1],[0,50000],[-1,101],[1,101],[0,49901],[1,1]]
      diff-sequences: 22.216ms
      [[-1,1],[0,50000],[-1,101],[1,101],[0,49901],[1,1]]
      jest-diff (diff-sequences), no cleanup: 20.546ms
      [[-1,1],[0,50000],[-1,101],[1,101],[0,49901],[1,1]]
      jest-diff (diff-sequences), with cleanup: 19.222ms
      [[-1,1],[0,50000],[-1,1],[1,101],[0,49900],[-1,100],[0,1],[1,1]]
      jsdiff: 33.82ms

    cards.json with different languages
      67781 items
      diff-match-patch: 1:04.122 (m:ss.mmm)
      57514 items
      diff-match-patch (checklines=false): 2:00.283 (m:ss.mmm)
      67781 items
      diff-sequences: 1:09.486 (m:ss.mmm)
      67781 items
      jest-diff (diff-sequences), no cleanup: 1:06.452 (m:ss.mmm)
      52937 items
      jest-diff (diff-sequences), with cleanup: 1:09.118 (m:ss.mmm)
      ...
      (jsdiff cannot complete this test case in 20+ minutes)

Observations:
- In the last test case, `jsdiff` does not implement O(D^2) -> O(D) space
  optimization so it is practically unusable (reported as kpdecker/jsdiff#396).
  `diff-match-patch` and `jest-diff` both implement the linear space
  optimization, and have similar performance.
- `diff-match-patch` strips common prefix and suffix, which makes it faster
  than `jest-diff` in "common prefix and suffix" test cases.
- Both `diff-match-patch` and `jest-diff` can take a long time on "cleanup".
  See the "without common prefix or suffix 1" test case. We probably want
  to only enable cleanup for smaller input.
- `diff-match-patch` performs visibly worse on the "without common prefix
  or suffix 2" test case. From the code it looks like `diff-match-patch` uses
  some kind of heuristics that tries to speed up things but ends up slowing it
  down.
- Without cleanup, `jest-diff` might output `[1,51],[1,50]` that can be
  "obviously" merged to `[1,101]`. We might use a lightweight cleanup logic
  for that.
- Reading the code, `diff-match-patch` turns lines into char codes. It cannot
  handle 65536 unique lines.
  (https://github.com/google/diff-match-patch/blob/62f2e689f498f9c92dbc588c58750addec9b1654/javascript/diff_match_patch_uncompressed.js#L503)

Conclusions:
- `jest-diff` (and `diff-sequences` under the hood) is overall the best choice.
  It has expected time and space complexities, and provides flexibility to skip
  the potentially slow "cleanup", and can support >65k unique lines.
- `jest-diff` misses the "skip common prefix / suffix" optimization that
  `diff-match-patch` has, and seems practically important (editing a line in
  the editor - all lines are common prefixes and suffixes except for the line
  being edited). The optimization is not hard to implement. This diff
  implements it.
- For certain use-cases (ex. linelog) where the diff content is not needed
  (at least for the left / "a" side), it should use `diff-sequences` to avoid
  overhead preparing the diff content.
  - `jest-diff`'s `diffLines` outputs one line per `Diff` but we want
    one chunk per `Diff`.
  - `jest-diff`'s `diffStringsRaw` produces one `Diff` per chunk, and because
    [`string.slice` is O(1) in V8](https://stackoverflow.com/a/72545403), it has
    acceptable performance. But mapping lines to chars would introduce the
    65535 unique line limit undesirably.

Reviewed By: evangrayk

Differential Revision: D43857949

fbshipit-source-id: 9a3d85ebf10c9b82da8ab5cba4e14e519bbf264d
  • Loading branch information
quark-zju authored and facebook-github-bot committed Mar 7, 2023
1 parent 41391d8 commit bce27b2
Show file tree
Hide file tree
Showing 3 changed files with 80 additions and 53 deletions.
5 changes: 1 addition & 4 deletions addons/isl/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,12 @@
"@testing-library/jest-dom": "^5.14.1",
"@testing-library/react": "^13.0.0",
"@testing-library/user-event": "^13.2.1",
"@types/diff": "^5.0.2",
"@types/jest": "^27.0.1",
"@types/node": "^16.7.13",
"@types/react": "^18.0.0",
"@types/react-dom": "^18.0.0",
"@vscode/webview-ui-toolkit": "^1.0.0",
"diff": "^5.0.0",
"diff-match-patch": "^1.0.5",
"diff-sequences": "^29.4.3",
"isl-server": "0.0.0",
"react": "^18.1.0",
"react-dom": "^18.1.0",
Expand All @@ -23,7 +21,6 @@
"typescript": "^4.4.2"
},
"devDependencies": {
"@types/diff-match-patch": "^1.0.32",
"rewire": "^6.0.0",
"ts-loader": "^9.3.1"
},
Expand Down
113 changes: 74 additions & 39 deletions addons/isl/src/linelog.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,8 @@ SOFTWARE.
*/

import {diff_match_patch} from 'diff-match-patch';

const dmp = new diff_match_patch();

// The timeout does not seem to affect dmp performance.
// But bumping it produces better diff results.
dmp.Diff_Timeout = 10000;
// Read D43857949 about the choice of the diff library.
import diffSequences from 'diff-sequences';

/** Operation code. */
enum Op {
Expand Down Expand Up @@ -400,13 +395,13 @@ class LineLog {
const [aRev, bRev] = rev ? [rev, rev] : [this.maxRev, this.maxRev + 1];
const b = text;

const lines = splitLines(b);
const bLines = splitLines(b);
this.checkOut(aRev);
const a = this.content;
const blocks = diffLines(a, b);
const aLines = splitLines(this.content);
const blocks = diffLines(aLines, bLines);

blocks.reverse().forEach(([a1, a2, b1, b2]) => {
this.editChunk(a1, a2, bRev, lines.slice(b1, b2));
this.editChunk(a1, a2, bRev, bLines.slice(b1, b2));
});
this.content = b;
this.lastCheckoutKey = `${bRev},null`;
Expand All @@ -429,41 +424,59 @@ class LineLog {
}

/**
* Calculate the differences.
* Calculate the line differences. For performance, this function only
* returns the line indexes for different chunks. The line contents
* are not returned.
*
* @param a Content of the "a" side.
* @param b Content of the "b" side.
* @param aLines lines on the "a" side.
* @param bLines lines on the "b" side.
* @returns A list of `(a1, a2, b1, b2)` tuples for the line ranges that
* are different between "a" and "b".
*/
function diffLines(a: string, b: string): [LineIdx, LineIdx, LineIdx, LineIdx][] {
const {chars1, chars2} = dmp.diff_linesToChars_(a, b);
function diffLines(aLines: string[], bLines: string[]): [LineIdx, LineIdx, LineIdx, LineIdx][] {
// Avoid O(string length) comparison.
const [aList, bList] = stringsToInts([aLines, bLines]);

// Skip common prefix and suffix.
let aLen = aList.length;
let bLen = bList.length;
const minLen = Math.min(aLen, bLen);
let commonPrefixLen = 0;
while (commonPrefixLen < minLen && aList[commonPrefixLen] === bList[commonPrefixLen]) {
commonPrefixLen += 1;
}
while (aLen > commonPrefixLen && bLen > commonPrefixLen && aList[aLen - 1] === bList[bLen - 1]) {
aLen -= 1;
bLen -= 1;
}
aLen -= commonPrefixLen;
bLen -= commonPrefixLen;

// Run the diff algorithm.
const blocks: [LineIdx, LineIdx, LineIdx, LineIdx][] = [];
let a1 = 0,
a2 = 0,
b1 = 0,
b2 = 0;
const push = (len: number) => {
let a1 = 0;
let b1 = 0;

function isCommon(aIndex: number, bIndex: number) {
return aList[aIndex + commonPrefixLen] === bList[bIndex + commonPrefixLen];
}

function foundSequence(n: LineIdx, a2: LineIdx, b2: LineIdx) {
if (a1 !== a2 || b1 !== b2) {
blocks.push([a1, a2, b1, b2]);
}
a1 = a2 = a2 + len;
b1 = b2 = b2 + len;
};
dmp.diff_main(chars1, chars2, false).forEach(x => {
const [op, chars] = x;
const len = chars.length;
if (op === 0) {
push(len);
}
if (op < 0) {
a2 += len;
blocks.push([
a1 + commonPrefixLen,
a2 + commonPrefixLen,
b1 + commonPrefixLen,
b2 + commonPrefixLen,
]);
}
if (op > 0) {
b2 += len;
}
});
push(0);
a1 = a2 + n;
b1 = b2 + n;
}

diffSequences(aLen, bLen, isCommon, foundSequence);
foundSequence(0, aLen, bLen);

return blocks;
}

Expand All @@ -485,6 +498,28 @@ function splitLines(s: string): string[] {
return result;
}

/**
* Make strings with the same content use the same integer
* for fast comparasion.
*/
function stringsToInts(linesArray: string[][]): number[][] {
// This is similar to diff-match-patch's diff_linesToChars_ but is not
// limited to 65536 unique lines.
const lineMap = new Map<string, number>();
return linesArray.map(lines =>
lines.map(line => {
const existingId = lineMap.get(line);
if (existingId != null) {
return existingId;
} else {
const id = lineMap.size;
lineMap.set(line, id);
return id;
}
}),
);
}

/** If the assertion fails, throw an `Error` with the given `message`. */
function assert(condition: boolean, message: string) {
if (!condition) {
Expand Down
15 changes: 5 additions & 10 deletions addons/yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -2517,11 +2517,6 @@
dependencies:
"@types/node" "*"

"@types/diff-match-patch@^1.0.32":
version "1.0.32"
resolved "https://registry.yarnpkg.com/@types/diff-match-patch/-/diff-match-patch-1.0.32.tgz#d9c3b8c914aa8229485351db4865328337a3d09f"
integrity sha512-bPYT5ECFiblzsVzyURaNhljBH2Gh1t9LowgUwciMrNAhFewLkHT2H0Mto07Y4/3KCOGZHRQll3CTtQZ0X11D/A==

"@types/diff@^5.0.2":
version "5.0.2"
resolved "https://registry.yarnpkg.com/@types/diff/-/diff-5.0.2.tgz#dd565e0086ccf8bc6522c6ebafd8a3125c91c12b"
Expand Down Expand Up @@ -4813,11 +4808,6 @@ didyoumean@^1.2.2:
resolved "https://registry.yarnpkg.com/didyoumean/-/didyoumean-1.2.2.tgz#989346ffe9e839b4555ecf5666edea0d3e8ad037"
integrity sha512-gxtyfqMg7GKyhQmb056K7M3xszy/myH8w+B4RT+QXBQsvAOdc3XymqDDPHx1BgPgsdAA5SIifona89YtRATDzw==

diff-match-patch@^1.0.5:
version "1.0.5"
resolved "https://registry.yarnpkg.com/diff-match-patch/-/diff-match-patch-1.0.5.tgz#abb584d5f10cd1196dfc55aa03701592ae3f7b37"
integrity sha512-IayShXAgj/QMXgB0IWmKx+rOPuGMhqm5w6jvFxmVenXKIzRqTAAsbBPT3kWQeGANj3jGgvcvv4yK6SxqYmikgw==

diff-sequences@^27.5.1:
version "27.5.1"
resolved "https://registry.yarnpkg.com/diff-sequences/-/diff-sequences-27.5.1.tgz#eaecc0d327fd68c8d9672a1e64ab8dccb2ef5327"
Expand All @@ -4828,6 +4818,11 @@ diff-sequences@^28.1.1:
resolved "https://registry.yarnpkg.com/diff-sequences/-/diff-sequences-28.1.1.tgz#9989dc731266dc2903457a70e996f3a041913ac6"
integrity sha512-FU0iFaH/E23a+a718l8Qa/19bF9p06kgE0KipMOMadwa3SjnaElKzPaUC0vnibs6/B/9ni97s61mcejk8W1fQw==

diff-sequences@^29.4.3:
version "29.4.3"
resolved "https://registry.yarnpkg.com/diff-sequences/-/diff-sequences-29.4.3.tgz#9314bc1fabe09267ffeca9cbafc457d8499a13f2"
integrity sha512-ofrBgwpPhCD85kMKtE9RYFFq6OC1A89oW2vvgWZNCwxrUpRUILopY7lsYyMDSjc8g6U6aiO0Qubg6r4Wgt5ZnA==

diff@^4.0.1:
version "4.0.2"
resolved "https://registry.yarnpkg.com/diff/-/diff-4.0.2.tgz#60f3aecb89d5fae520c11aa19efc2bb982aade7d"
Expand Down

0 comments on commit bce27b2

Please sign in to comment.