From f5b232c44026df5392305c3560c92720ffa068da Mon Sep 17 00:00:00 2001 From: FredrikSchaefer <67001822+FredrikSchaefer@users.noreply.github.com> Date: Fri, 10 Nov 2023 20:13:04 +0100 Subject: [PATCH] Support adding custom detectors (#603) Co-authored-by: Sindre Sorhus Co-authored-by: Borewit --- core.d.ts | 90 +++++++++++++++++++++- core.js | 155 ++++++++++++++++++++++--------------- fixture/fixture.unicorn | 1 + index.js | 7 +- readme.md | 60 +++++++++++++++ test.js | 166 ++++++++++++++++++++++++++++++++++++++++ 6 files changed, 415 insertions(+), 64 deletions(-) create mode 100644 fixture/fixture.unicorn diff --git a/core.d.ts b/core.d.ts index 1f3cf074..37f06266 100644 --- a/core.d.ts +++ b/core.d.ts @@ -421,7 +421,10 @@ if (stream2.fileType?.mime === 'image/jpeg') { export function fileTypeStream(readableStream: ReadableStream, options?: StreamOptions): Promise<ReadableStreamWithFileType>; /** -Detect the file type of a [`Blob`](https://nodejs.org/api/buffer.html#class-blob). +Detect the file type of a [`Blob`](https://nodejs.org/api/buffer.html#class-blob) or [`File`](https://developer.mozilla.org/en-US/docs/Web/API/File). + +@param blob [`Blob`](https://nodejs.org/api/buffer.html#class-blob) used for file detection +@returns The detected file type and MIME type, or `undefined` when there is no match. @example ``` @@ -437,3 +440,88 @@ console.log(await fileTypeFromBlob(blob)); ``` */ export declare function fileTypeFromBlob(blob: Blob): Promise<FileTypeResult | undefined>; + +/** +Function that allows specifying custom detection mechanisms. + +An iterable of detectors can be provided via the `fileTypeOptions` argument for the {@link FileTypeParser.constructor}. + +The detectors are called before the default detections in the provided order. + +Custom detectors can be used to add new FileTypeResults or to modify return behaviour of existing FileTypeResult detections. + +If the detector returns `undefined`, there are 2 possible scenarios: + + 1. 
The detector has not read from the tokenizer; detection proceeds with the next available detector. + 2. The detector has read from the tokenizer (`tokenizer.position` has been increased). + In that case no further detectors will be executed and the final conclusion is that file-type returns undefined. + Note that this is an exceptional scenario, as the detector takes away the opportunity for any other detector to determine the file type. + +Example detector array which can be extended and provided via the `fileTypeOptions` argument: + +``` +import {FileTypeParser} from 'file-type'; + +const customDetectors = [ + async tokenizer => { + const unicornHeader = [85, 78, 73, 67, 79, 82, 78]; // "UNICORN" as decimal byte values + const buffer = Buffer.alloc(7); + await tokenizer.peekBuffer(buffer, {length: unicornHeader.length, mayBeLess: true}); + if (unicornHeader.every((value, index) => value === buffer[index])) { + return {ext: 'unicorn', mime: 'application/unicorn'}; + } + + return undefined; + }, +]; + +const buffer = Buffer.from("UNICORN"); +const parser = new FileTypeParser({customDetectors}); +const fileType = await parser.fromBuffer(buffer); +console.log(fileType); +``` + +@param tokenizer - [Tokenizer](https://github.com/Borewit/strtok3#tokenizer), used to read the file content from. +@param fileType - FileTypeResult detected by the standard detections or a previous custom detection. Undefined if no matching fileTypeResult could be found. +@returns The detected file extension and MIME type as a FileTypeResult-like object, or `undefined` when there is no match. 
+*/ +export type Detector = (tokenizer: ITokenizer, fileType?: FileTypeResult) => Promise<FileTypeResult | undefined>; + +export type FileTypeOptions = { + customDetectors?: Iterable<Detector>; +}; + +export declare class TokenizerPositionError extends Error { + constructor(message?: string); +} + +export declare class FileTypeParser { + detectors: Iterable<Detector>; + + constructor(options?: {customDetectors?: Iterable<Detector>}); + + /** + Works the same way as {@link fileTypeFromBuffer}, additionally taking into account custom detectors (if any were provided to the constructor). + */ + fromBuffer(buffer: Uint8Array | ArrayBuffer): Promise<FileTypeResult | undefined>; + + /** + Works the same way as {@link fileTypeFromStream}, additionally taking into account custom detectors (if any were provided to the constructor). + */ + fromStream(stream: ReadableStream): Promise<FileTypeResult | undefined>; + + /** + Works the same way as {@link fileTypeFromTokenizer}, additionally taking into account custom detectors (if any were provided to the constructor). + */ + fromTokenizer(tokenizer: ITokenizer): Promise<FileTypeResult | undefined>; + + /** + Works the same way as {@link fileTypeFromBlob}, additionally taking into account custom detectors (if any were provided to the constructor). + */ + fromBlob(blob: Blob): Promise<FileTypeResult | undefined>; + + /** + Works the same way as {@link fileTypeStream}, additionally taking into account custom detectors (if any were provided to the constructor). + */ + toDetectionStream(readableStream: ReadableStream, options?: StreamOptions): Promise<ReadableStreamWithFileType>; +} diff --git a/core.js b/core.js index 8145965b..01dcb11e 100644 --- a/core.js +++ b/core.js @@ -11,31 +11,15 @@ import {extensions, mimeTypes} from './supported.js'; const minimumBytes = 4100; // A fair amount of file-types are detectable within this range. 
export async function fileTypeFromStream(stream) { - const tokenizer = await strtok3.fromStream(stream); - try { - return await fileTypeFromTokenizer(tokenizer); - } finally { - await tokenizer.close(); - } + return new FileTypeParser().fromStream(stream); } export async function fileTypeFromBuffer(input) { - if (!(input instanceof Uint8Array || input instanceof ArrayBuffer)) { - throw new TypeError(`Expected the \`input\` argument to be of type \`Uint8Array\` or \`Buffer\` or \`ArrayBuffer\`, got \`${typeof input}\``); - } - - const buffer = input instanceof Uint8Array ? input : new Uint8Array(input); - - if (!(buffer?.length > 1)) { - return; - } - - return fileTypeFromTokenizer(strtok3.fromBuffer(buffer)); + return new FileTypeParser().fromBuffer(input); } export async function fileTypeFromBlob(blob) { - const buffer = await blob.arrayBuffer(); - return fileTypeFromBuffer(new Uint8Array(buffer)); + return new FileTypeParser().fromBlob(blob); } function _check(buffer, headers, options) { @@ -60,16 +44,98 @@ function _check(buffer, headers, options) { } export async function fileTypeFromTokenizer(tokenizer) { - try { - return new FileTypeParser().parse(tokenizer); - } catch (error) { - if (!(error instanceof strtok3.EndOfStreamError)) { - throw error; + return new FileTypeParser().fromTokenizer(tokenizer); +} + +export class FileTypeParser { + constructor(options) { + this.detectors = options?.customDetectors; + + this.fromTokenizer = this.fromTokenizer.bind(this); + this.fromBuffer = this.fromBuffer.bind(this); + this.parse = this.parse.bind(this); + } + + async fromTokenizer(tokenizer) { + const initialPosition = tokenizer.position; + + for (const detector of this.detectors || []) { + const fileType = await detector(tokenizer); + if (fileType) { + return fileType; + } + + if (initialPosition !== tokenizer.position) { + return undefined; // Cannot proceed scanning if the tokenizer is at an arbitrary position + } } + + return this.parse(tokenizer); + } + + async 
fromBuffer(input) { + if (!(input instanceof Uint8Array || input instanceof ArrayBuffer)) { + throw new TypeError(`Expected the \`input\` argument to be of type \`Uint8Array\` or \`Buffer\` or \`ArrayBuffer\`, got \`${typeof input}\``); + } + + const buffer = input instanceof Uint8Array ? input : new Uint8Array(input); + + if (!(buffer?.length > 1)) { + return; + } + + return this.fromTokenizer(strtok3.fromBuffer(buffer)); + } + + async fromBlob(blob) { + const buffer = await blob.arrayBuffer(); + return this.fromBuffer(new Uint8Array(buffer)); + } + + async fromStream(stream) { + const tokenizer = await strtok3.fromStream(stream); + try { + return await this.fromTokenizer(tokenizer); + } finally { + await tokenizer.close(); + } + } + + async toDetectionStream(readableStream, options = {}) { + const {default: stream} = await import('node:stream'); + const {sampleSize = minimumBytes} = options; + + return new Promise((resolve, reject) => { + readableStream.on('error', reject); + + readableStream.once('readable', () => { + (async () => { + try { + // Set up output stream + const pass = new stream.PassThrough(); + const outputStream = stream.pipeline ? stream.pipeline(readableStream, pass, () => {}) : readableStream.pipe(pass); + + // Read the input stream and detect the filetype + const chunk = readableStream.read(sampleSize) ?? readableStream.read() ?? 
Buffer.alloc(0); + try { + pass.fileType = await this.fromBuffer(chunk); + } catch (error) { + if (error instanceof strtok3.EndOfStreamError) { + pass.fileType = undefined; + } else { + reject(error); + } + } + + resolve(outputStream); + } catch (error) { + reject(error); + } + })(); + }); + }); } -} -class FileTypeParser { check(header, options) { return _check(this.buffer, header, options); } @@ -211,7 +277,7 @@ class FileTypeParser { } await tokenizer.ignore(id3HeaderLength); - return fileTypeFromTokenizer(tokenizer); // Skip ID3 header, recursion + return this.fromTokenizer(tokenizer); // Skip ID3 header, recursion } // Musepack, SV7 @@ -1609,39 +1675,8 @@ class FileTypeParser { } } -export async function fileTypeStream(readableStream, {sampleSize = minimumBytes} = {}) { - const {default: stream} = await import('node:stream'); - - return new Promise((resolve, reject) => { - readableStream.on('error', reject); - - readableStream.once('readable', () => { - (async () => { - try { - // Set up output stream - const pass = new stream.PassThrough(); - const outputStream = stream.pipeline ? stream.pipeline(readableStream, pass, () => {}) : readableStream.pipe(pass); - - // Read the input stream and detect the filetype - const chunk = readableStream.read(sampleSize) ?? readableStream.read() ?? 
Buffer.alloc(0); - try { - const fileType = await fileTypeFromBuffer(chunk); - pass.fileType = fileType; - } catch (error) { - if (error instanceof strtok3.EndOfStreamError) { - pass.fileType = undefined; - } else { - reject(error); - } - } - - resolve(outputStream); - } catch (error) { - reject(error); - } - })(); - }); - }); +export async function fileTypeStream(readableStream, options = {}) { + return new FileTypeParser().toDetectionStream(readableStream, options); } export const supportedExtensions = new Set(extensions); diff --git a/fixture/fixture.unicorn b/fixture/fixture.unicorn new file mode 100644 index 00000000..003240e1 --- /dev/null +++ b/fixture/fixture.unicorn @@ -0,0 +1 @@ +UNICORN FILE CONTENT diff --git a/index.js b/index.js index 8d0537ea..24bacf9d 100644 --- a/index.js +++ b/index.js @@ -1,10 +1,11 @@ import * as strtok3 from 'strtok3'; -import {fileTypeFromTokenizer} from './core.js'; +import {FileTypeParser} from './core.js'; -export async function fileTypeFromFile(path) { +export async function fileTypeFromFile(path, fileTypeOptions) { const tokenizer = await strtok3.fromFile(path); try { - return await fileTypeFromTokenizer(tokenizer); + const parser = new FileTypeParser(fileTypeOptions); + return await parser.fromTokenizer(tokenizer); } finally { await tokenizer.close(); } diff --git a/readme.md b/readme.md index 70f6aa92..0451bbe5 100644 --- a/readme.md +++ b/readme.md @@ -189,6 +189,10 @@ console.log(await fileTypeFromBlob(blob)); //=> {ext: 'txt', mime: 'plain/text'} ``` +#### blob + +Type: [`Blob`](https://developer.mozilla.org/en-US/docs/Web/API/Blob) + ### fileTypeFromTokenizer(tokenizer) Detect the file type from an `ITokenizer` source. @@ -305,6 +309,48 @@ Returns a `Set` of supported file extensions. Returns a `Set` of supported MIME types. +## Custom detectors + +A custom detector is a function that allows specifying custom detection mechanisms. 
+ +An iterable of detectors can be provided via the `fileTypeOptions` argument for the `FileTypeParser.constructor`. + +The detectors are called before the default detections in the provided order. + +Custom detectors can be used to add new `FileTypeResults` or to modify return behaviour of existing FileTypeResult detections. + +If the detector returns `undefined`, there are 2 possible scenarios: + +1. The detector has not read from the tokenizer; detection proceeds with the next available detector. +2. The detector has read from the tokenizer (`tokenizer.position` has been increased). + In that case no further detectors will be executed and the final conclusion is that file-type returns undefined. + Note that this is an exceptional scenario, as the detector takes away the opportunity for any other detector to determine the file type. + + +Example detector array which can be extended and provided to the `FileTypeParser` constructor via the `fileTypeOptions` argument: +```js +import {FileTypeParser} from 'file-type'; + +const customDetectors = [ + async tokenizer => { + const unicornHeader = [85, 78, 73, 67, 79, 82, 78]; // "UNICORN" as decimal byte values + const buffer = Buffer.alloc(7); + await tokenizer.peekBuffer(buffer, {length: unicornHeader.length, mayBeLess: true}); + + if (unicornHeader.every((value, index) => value === buffer[index])) { + return {ext: 'unicorn', mime: 'application/unicorn'}; + } + + return undefined; + }, +]; + +const buffer = Buffer.from("UNICORN"); +const parser = new FileTypeParser({customDetectors}); +const fileType = await parser.fromBuffer(buffer); +console.log(fileType); +``` + ## Supported file types - [`3g2`](https://en.wikipedia.org/wiki/3GP_and_3G2#3G2) - Multimedia container format defined by the 3GPP2 for 3G CDMA2000 multimedia services @@ -470,6 +516,20 @@ The following file types will not be accepted: - `.csv` - [Reason.](https://github.com/sindresorhus/file-type/issues/264#issuecomment-568439196) - `.svg` - Detecting it requires a full-blown 
parser. Check out [`is-svg`](https://github.com/sindresorhus/is-svg) for something that mostly works. +#### tokenizer + +Type: [`ITokenizer`](https://github.com/Borewit/strtok3#tokenizer) + +Usable as source of the examined file. + +#### fileType + +Type: FileTypeResult + +Object having an `ext` (extension) and `mime` (mime type) property. + +Detected by the standard detections or a previous custom detection. Undefined if no matching fileTypeResult could be found. + ## Related - [file-type-cli](https://github.com/sindresorhus/file-type-cli) - CLI for this module diff --git a/test.js b/test.js index 08b7de3b..078d5a0b 100644 --- a/test.js +++ b/test.js @@ -7,11 +7,13 @@ import stream from 'node:stream'; import test from 'ava'; import {readableNoopStream} from 'noop-stream'; import {Parser as ReadmeParser} from 'commonmark'; +import * as strtok3 from 'strtok3/core'; // eslint-disable-line n/file-extension-in-import import { fileTypeFromBuffer, fileTypeFromStream, fileTypeFromFile, fileTypeFromBlob, + FileTypeParser, fileTypeStream, supportedExtensions, supportedMimeTypes, @@ -662,3 +664,167 @@ test('corrupt MKV throws', async t => { const filePath = path.join(__dirname, 'fixture/fixture-corrupt.mkv'); await t.throwsAsync(fileTypeFromFile(filePath), {message: /out of range/}); }); + +// Create a custom detector for the just made up "unicorn" file type +const unicornDetector = async tokenizer => { + const unicornHeader = [85, 78, 73, 67, 79, 82, 78]; // "UNICORN" as decimal string + const buffer = Buffer.alloc(7); + await tokenizer.peekBuffer(buffer, {length: unicornHeader.length, mayBeLess: true}); + if (unicornHeader.every((value, index) => value === buffer[index])) { + return {ext: 'unicorn', mime: 'application/unicorn'}; + } + + return undefined; +}; + +const mockPngDetector = _tokenizer => ({ext: 'mockPng', mime: 'image/mockPng'}); + +const tokenizerPositionChanger = tokenizer => { + const buffer = Buffer.alloc(1); + tokenizer.readBuffer(buffer, {length: 1, 
mayBeLess: true}); +}; + +test('fileTypeFromBlob should detect custom file type "unicorn" using custom detectors', async t => { + // Set up the "unicorn" file content + const header = 'UNICORN FILE\n'; + const blob = new Blob([header]); + + const customDetectors = [unicornDetector]; + const parser = new FileTypeParser({customDetectors}); + + const result = await parser.fromBlob(blob); + t.deepEqual(result, {ext: 'unicorn', mime: 'application/unicorn'}); +}); + +test('fileTypeFromBlob should keep detecting default file types when no custom detector matches', async t => { + const file = path.join(__dirname, 'fixture', 'fixture.png'); + const chunk = fs.readFileSync(file); + const blob = new Blob([chunk]); + + const customDetectors = [unicornDetector]; + const parser = new FileTypeParser({customDetectors}); + + const result = await parser.fromBlob(blob); + t.deepEqual(result, {ext: 'png', mime: 'image/png'}); +}); + +test('fileTypeFromBlob should allow overriding default file type detectors', async t => { + const file = path.join(__dirname, 'fixture', 'fixture.png'); + const chunk = fs.readFileSync(file); + const blob = new Blob([chunk]); + + const customDetectors = [mockPngDetector]; + const parser = new FileTypeParser({customDetectors}); + + const result = await parser.fromBlob(blob); + t.deepEqual(result, {ext: 'mockPng', mime: 'image/mockPng'}); +}); + +test('fileTypeFromBuffer should detect custom file type "unicorn" using custom detectors', async t => { + const header = 'UNICORN FILE\n'; + const uint8ArrayContent = new TextEncoder().encode(header); + + const customDetectors = [unicornDetector]; + const parser = new FileTypeParser({customDetectors}); + + const result = await parser.fromBuffer(uint8ArrayContent); + t.deepEqual(result, {ext: 'unicorn', mime: 'application/unicorn'}); +}); + +test('fileTypeFromBuffer should keep detecting default file types when no custom detector matches', async t => { + const file = path.join(__dirname, 'fixture', 'fixture.png'); + 
const uint8ArrayContent = fs.readFileSync(file); + + const customDetectors = [unicornDetector]; + const parser = new FileTypeParser({customDetectors}); + + const result = await parser.fromBuffer(uint8ArrayContent); + t.deepEqual(result, {ext: 'png', mime: 'image/png'}); +}); + +test('fileTypeFromBuffer should allow overriding default file type detectors', async t => { + const file = path.join(__dirname, 'fixture', 'fixture.png'); + const uint8ArrayContent = fs.readFileSync(file); + + const customDetectors = [mockPngDetector]; + const parser = new FileTypeParser({customDetectors}); + + const result = await parser.fromBuffer(uint8ArrayContent); + t.deepEqual(result, {ext: 'mockPng', mime: 'image/mockPng'}); +}); + +class CustomReadableStream extends stream.Readable { + _read(_size) { + this.push('UNICORN'); + } +} +test('fileTypeFromStream should detect custom file type "unicorn" using custom detectors', async t => { + const readableStream = new CustomReadableStream(); + + const customDetectors = [unicornDetector]; + const parser = new FileTypeParser({customDetectors}); + + const result = await parser.fromStream(readableStream); + t.deepEqual(result, {ext: 'unicorn', mime: 'application/unicorn'}); +}); + +test('fileTypeFromStream should keep detecting default file types when no custom detector matches', async t => { + const file = path.join(__dirname, 'fixture', 'fixture.png'); + const readableStream = fs.createReadStream(file); + + const customDetectors = [unicornDetector]; + const parser = new FileTypeParser({customDetectors}); + + const result = await parser.fromStream(readableStream); + t.deepEqual(result, {ext: 'png', mime: 'image/png'}); +}); + +test('fileTypeFromStream should allow overriding default file type detectors', async t => { + const file = path.join(__dirname, 'fixture', 'fixture.png'); + const readableStream = fs.createReadStream(file); + + const customDetectors = [mockPngDetector]; + const parser = new FileTypeParser({customDetectors}); + + const 
result = await parser.fromStream(readableStream); + t.deepEqual(result, {ext: 'mockPng', mime: 'image/mockPng'}); +}); + +test('fileTypeFromFile should detect custom file type "unicorn" using custom detectors', async t => { + const file = path.join(__dirname, 'fixture', 'fixture.unicorn'); + + const customDetectors = [unicornDetector]; + + const result = await fileTypeFromFile(file, {customDetectors}); + t.deepEqual(result, {ext: 'unicorn', mime: 'application/unicorn'}); +}); + +test('fileTypeFromFile should keep detecting default file types when no custom detector matches', async t => { + const file = path.join(__dirname, 'fixture', 'fixture.png'); + + const customDetectors = [unicornDetector]; + + const result = await fileTypeFromFile(file, {customDetectors}); + t.deepEqual(result, {ext: 'png', mime: 'image/png'}); +}); + +test('fileTypeFromFile should allow overriding default file type detectors', async t => { + const file = path.join(__dirname, 'fixture', 'fixture.png'); + + const customDetectors = [mockPngDetector]; + + const result = await fileTypeFromFile(file, {customDetectors}); + t.deepEqual(result, {ext: 'mockPng', mime: 'image/mockPng'}); +}); + +test('fileTypeFromTokenizer should return undefined when a custom detector changes the tokenizer position and does not return a file type', async t => { + const header = 'UNICORN FILE\n'; + const uint8ArrayContent = new TextEncoder().encode(header); + + // Include the unicornDetector here to verify it's not used after the tokenizer.position changed + const customDetectors = [tokenizerPositionChanger, unicornDetector]; + const parser = new FileTypeParser({customDetectors}); + + const result = await parser.fromTokenizer(strtok3.fromBuffer(uint8ArrayContent)); + t.is(result, undefined); +});