Skip to content

Commit

Permalink
Generate ToUnicodeMap bfrange in multiple ranges (#1498) (#1499)
Browse files Browse the repository at this point in the history
* Generate ToUnicodeMap bfrange in multiple ranges (#1498)

This resolves #1498.

* Add unit test for bfrange lines in toUnicodeMap

* Add changelog line
  • Loading branch information
orzFly committed Feb 26, 2024
1 parent 485b7e6 commit 946f9cf
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 2 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Expand Up @@ -7,6 +7,7 @@
- Fix for soft hyphen not being replaced by visible hyphen if necessary (#457)
- Optimize output files by ignoring identity transforms
- Fix for Acroforms - setting an option to false will still apply the flag (#1495)
- Fix for text extraction in PDFium-based viewers due to invalid ToUnicodeMap (#1498)

### [v0.14.0] - 2023-11-09

Expand Down
11 changes: 10 additions & 1 deletion lib/font/embedded.js
Expand Up @@ -252,6 +252,15 @@ class EmbeddedFont extends PDFFont {
entries.push(`<${encoded.join(' ')}>`);
}

// Split the glyph-to-unicode entries into bfrange lines of at most
// `chunkSize` entries each. Every line starts on a multiple of 256, so the
// first and last code on a line never span more than one 256-code window.
const chunkSize = 256;
const ranges = [];
for (let start = 0; start < entries.length; start += chunkSize) {
  // `end` is exclusive; the final chunk may be shorter than chunkSize.
  const end = Math.min(start + chunkSize, entries.length);
  const mapped = entries.slice(start, end).join(' ');
  ranges.push(`<${toHex(start)}> <${toHex(end - 1)}> [${mapped}]`);
}

cmap.end(`\
/CIDInit /ProcSet findresource begin
12 dict begin
Expand All @@ -267,7 +276,7 @@ begincmap
<0000><ffff>
endcodespacerange
1 beginbfrange
<0000> <${toHex(entries.length - 1)}> [${entries.join(' ')}]
${ranges.join('\n')}
endbfrange
endcmap
CMapName currentdict /CMap defineresource pop
Expand Down
45 changes: 44 additions & 1 deletion tests/unit/font.spec.js
@@ -1,5 +1,6 @@
import PDFFontFactory from '../../lib/font_factory';
import PDFDocument from '../../lib/document';
import PDFFontFactory from '../../lib/font_factory';
import { logData } from './helpers';

describe('EmbeddedFont', () => {
test('no fontLayoutCache option', () => {
Expand Down Expand Up @@ -52,4 +53,46 @@ describe('EmbeddedFont', () => {
expect(dictionary.data.BaseFont).toBe('BAJJZZ+Roboto-Regular');
});
});

// NOTE: this suite must not be marked `.only`, otherwise Jest skips every
// other test in this spec file.
describe('toUnicodeMap', () => {
  // Checks that every bfrange line written by toUnicodeCmap() keeps its low
  // and high codes within the same 256-code window (identical upper bytes),
  // and that all encoded glyphs are covered by the emitted ranges.
  test('bfrange lines should not cross highcode boundary', () => {
    // NOTE(review): compression is presumably disabled so the CMap text can
    // be matched with the regexes below — confirm.
    const doc = new PDFDocument({ compress: false });
    const font = PDFFontFactory.open(
      doc,
      'tests/fonts/Roboto-Regular.ttf',
      undefined,
      'F1099'
    );

    // 398 different glyphs
    font.encode('ABCDEFGHIJKLMNOPQRSTUVWXYZ');
    font.encode('abcdefghijklmnopqrstuvwxyz');
    font.encode('ÁÀÂÄÅÃÆÇÐÉÈÊËÍÌÎÏÑÓÒÔÖÕØŒÞÚÙÛÜÝŸ');
    font.encode('áàâäãåæçðéèêëíìîïıñóòôöõøœßþúùûüýÿ');
    font.encode('ĀĂĄĆČĎĐĒĖĘĚĞĢĪĮİĶŁĹĻĽŃŅŇŌŐŔŖŘŠŚŞȘŢȚŤŪŮŰŲŽŹŻ');
    font.encode('āăąćčďđēėęěğģīįķłĺļľńņňōőŕŗřšśşșţțťūůűųžźż');
    font.encode('ΑΒΓ∆ΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩΆΈΉΊΌΎΏΪΫ');
    font.encode('αβγδεζηθικλµνξοπρςστυφχψωάέήίόύώϊϋΐΰ');
    font.encode('АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ');
    font.encode('абвгдежзийклмнопрстуфхцчшщъыьэюя');
    font.encode('ЀЁЂЃЄЅІЇЈЉЊЋЌЍЎЏҐӁҒҖҚҢҮҰҲҶҺӘӢӨӮ');
    font.encode('ѐёђѓєѕіїјљњћќѝўџґӂғҗқңүұҳҷһәӣөӯ');

    // NOTE(review): logData presumably records chunks written to the document
    // from this point on, so it is set up before the CMap is generated —
    // confirm against ./helpers.
    const docData = logData(doc);
    font.toUnicodeCmap();
    const text = docData.map((d) => d.toString('utf8')).join('');

    let glyphs = 0;
    // Walk each beginbfrange/endbfrange section, then each
    // "<low> <high> [" line inside it.
    for (const block of text.matchAll(/beginbfrange\n((?:.|\n)*?)\nendbfrange/g)) {
      for (const line of block[1].matchAll(/^<([0-9a-f]+)>\s+<([0-9a-f]+)>\s+\[/igm)) {
        const low = parseInt(line[1], 16);
        const high = parseInt(line[2], 16);
        glyphs += high - low + 1;
        // Everything above the lowest byte must match between low and high.
        expect(high & 0xFFFFFF00).toBe(low & 0xFFFFFF00);
      }
    }

    // NOTE(review): the extra 1 is presumably the glyph at index 0 that the
    // subset always contains — confirm.
    expect(glyphs).toBe(398 + 1);
  });
});
});

0 comments on commit 946f9cf

Please sign in to comment.