Support adding custom detectors (#603)
Co-authored-by: Sindre Sorhus <sindresorhus@gmail.com>
Co-authored-by: Borewit <Borewit@users.noreply.github.com>
3 people committed Nov 10, 2023
1 parent b272572 commit f5b232c
Showing 6 changed files with 415 additions and 64 deletions.
90 changes: 89 additions & 1 deletion core.d.ts
@@ -421,7 +421,10 @@ if (stream2.fileType?.mime === 'image/jpeg') {
export function fileTypeStream(readableStream: ReadableStream, options?: StreamOptions): Promise<ReadableStreamWithFileType>;

/**
Detect the file type of a [`Blob`](https://nodejs.org/api/buffer.html#class-blob).
Detect the file type of a [`Blob`](https://nodejs.org/api/buffer.html#class-blob) or [`File`](https://developer.mozilla.org/en-US/docs/Web/API/File).
@param blob - The [`Blob`](https://nodejs.org/api/buffer.html#class-blob) used for file detection.
@returns The detected file type and MIME type, or `undefined` when there is no match.
@example
```
@@ -437,3 +440,88 @@ console.log(await fileTypeFromBlob(blob));
```
*/
export declare function fileTypeFromBlob(blob: Blob): Promise<FileTypeResult | undefined>;
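Since `File` extends `Blob`, the updated declaration above also covers `File` objects. A minimal usage sketch (not part of this commit; it assumes a runtime that provides the `File` constructor, such as browsers or Node.js 20+, and feeds in a hand-rolled JPEG signature):

```js
import {fileTypeFromBlob} from 'file-type';

// Three-byte JPEG signature; enough for the built-in JPEG check.
const file = new File([new Uint8Array([0xFF, 0xD8, 0xFF])], 'photo.jpg');

console.log(await fileTypeFromBlob(file));
//=> {ext: 'jpg', mime: 'image/jpeg'}
```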

/**
Function that allows specifying custom detection mechanisms.
An iterable of detectors can be provided via the `fileTypeOptions` argument to the {@link FileTypeParser.constructor}.
The detectors are called before the default detections, in the provided order.
Custom detectors can be used to add new `FileTypeResult` entries or to modify the return behaviour of existing `FileTypeResult` detections.
If a detector returns `undefined`, there are two possible scenarios:
1. The detector has not read from the tokenizer; detection proceeds with the next available detector.
2. The detector has read from the tokenizer (`tokenizer.position` has increased). In that case no further detectors are executed and the final conclusion is that file-type returns `undefined`. Note that this is an exceptional scenario, as the detector takes away the opportunity from any other detector to determine the file type.
Example detector array which can be extended and provided via the `fileTypeOptions` argument:
```
import {FileTypeParser} from 'file-type';

const customDetectors = [
	async tokenizer => {
		const unicornHeader = [85, 78, 73, 67, 79, 82, 78]; // 'UNICORN' in ASCII decimal

		const buffer = Buffer.alloc(7);
		await tokenizer.peekBuffer(buffer, {length: unicornHeader.length, mayBeLess: true});

		if (unicornHeader.every((value, index) => value === buffer[index])) {
			return {ext: 'unicorn', mime: 'application/unicorn'};
		}

		return undefined;
	},
];

const buffer = Buffer.from('UNICORN');

const parser = new FileTypeParser({customDetectors});

const fileType = await parser.fromBuffer(buffer);

console.log(fileType);
```
@param tokenizer - The [tokenizer](https://github.com/Borewit/strtok3#tokenizer) from which the file content is read.
@param fileType - `FileTypeResult` detected by the standard detections or a previous custom detection, or `undefined` if no match was found.
@returns The detected file extension and MIME type as a `FileTypeResult`-like object, or `undefined` when there is no match.
*/
export type Detector = (tokenizer: ITokenizer, fileType?: FileTypeResult) => Promise<FileTypeResult | undefined>;

export type FileTypeOptions = {
customDetectors?: Iterable<Detector>;
};

export declare class TokenizerPositionError extends Error {
constructor(message?: string);
}

export declare class FileTypeParser {
detectors: Iterable<Detector>;

constructor(options?: {customDetectors?: Iterable<Detector>});

/**
Works the same way as {@link fileTypeFromBuffer}, additionally taking into account custom detectors (if any were provided to the constructor).
*/
fromBuffer(buffer: Uint8Array | ArrayBuffer): Promise<FileTypeResult | undefined>;

/**
Works the same way as {@link fileTypeFromStream}, additionally taking into account custom detectors (if any were provided to the constructor).
*/
fromStream(stream: ReadableStream): Promise<FileTypeResult | undefined>;

/**
Works the same way as {@link fileTypeFromTokenizer}, additionally taking into account custom detectors (if any were provided to the constructor).
*/
fromTokenizer(tokenizer: ITokenizer): Promise<FileTypeResult | undefined>;

/**
Works the same way as {@link fileTypeFromBlob}, additionally taking into account custom detectors (if any were provided to the constructor).
*/
fromBlob(blob: Blob): Promise<FileTypeResult | undefined>;

/**
Works the same way as {@link fileTypeStream}, additionally taking into account custom detectors (if any were provided to the constructor).
*/
toDetectionStream(readableStream: ReadableStream, options?: StreamOptions): Promise<ReadableStreamWithFileType>;
}
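A minimal sketch (not part of this commit) of wiring the class above into stream-based detection; `noopDetector` and `Unicorn.png` are hypothetical, and a detector that returns `undefined` without reading simply falls through to the built-in detections:

```js
import {createReadStream} from 'node:fs';
import {FileTypeParser} from 'file-type';

// Hypothetical detector that never matches and never reads from the tokenizer,
// so detection continues with the default checks.
const noopDetector = async _tokenizer => undefined;

const parser = new FileTypeParser({customDetectors: [noopDetector]});

const fileType = await parser.fromStream(createReadStream('Unicorn.png'));
console.log(fileType);
//=> {ext: 'png', mime: 'image/png'} (assuming the file really is a PNG)
```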
155 changes: 95 additions & 60 deletions core.js
@@ -11,31 +11,15 @@ import {extensions, mimeTypes} from './supported.js';
const minimumBytes = 4100; // A fair amount of file-types are detectable within this range.

export async function fileTypeFromStream(stream) {
const tokenizer = await strtok3.fromStream(stream);
try {
return await fileTypeFromTokenizer(tokenizer);
} finally {
await tokenizer.close();
}
return new FileTypeParser().fromStream(stream);
}

export async function fileTypeFromBuffer(input) {
if (!(input instanceof Uint8Array || input instanceof ArrayBuffer)) {
throw new TypeError(`Expected the \`input\` argument to be of type \`Uint8Array\` or \`Buffer\` or \`ArrayBuffer\`, got \`${typeof input}\``);
}

const buffer = input instanceof Uint8Array ? input : new Uint8Array(input);

if (!(buffer?.length > 1)) {
return;
}

return fileTypeFromTokenizer(strtok3.fromBuffer(buffer));
return new FileTypeParser().fromBuffer(input);
}

export async function fileTypeFromBlob(blob) {
const buffer = await blob.arrayBuffer();
return fileTypeFromBuffer(new Uint8Array(buffer));
return new FileTypeParser().fromBlob(blob);
}

function _check(buffer, headers, options) {
@@ -60,16 +44,98 @@ function _check(buffer, headers, options) {
}

export async function fileTypeFromTokenizer(tokenizer) {
try {
return new FileTypeParser().parse(tokenizer);
} catch (error) {
if (!(error instanceof strtok3.EndOfStreamError)) {
throw error;
return new FileTypeParser().fromTokenizer(tokenizer);
}

export class FileTypeParser {
constructor(options) {
this.detectors = options?.customDetectors;

this.fromTokenizer = this.fromTokenizer.bind(this);
this.fromBuffer = this.fromBuffer.bind(this);
this.parse = this.parse.bind(this);
}

async fromTokenizer(tokenizer) {
const initialPosition = tokenizer.position;

for (const detector of this.detectors || []) {
const fileType = await detector(tokenizer);
if (fileType) {
return fileType;
}

if (initialPosition !== tokenizer.position) {
return undefined; // Cannot proceed with scanning if the tokenizer is at an arbitrary position
}
}

return this.parse(tokenizer);
}

async fromBuffer(input) {
if (!(input instanceof Uint8Array || input instanceof ArrayBuffer)) {
throw new TypeError(`Expected the \`input\` argument to be of type \`Uint8Array\` or \`Buffer\` or \`ArrayBuffer\`, got \`${typeof input}\``);
}

const buffer = input instanceof Uint8Array ? input : new Uint8Array(input);

if (!(buffer?.length > 1)) {
return;
}

return this.fromTokenizer(strtok3.fromBuffer(buffer));
}

async fromBlob(blob) {
const buffer = await blob.arrayBuffer();
return this.fromBuffer(new Uint8Array(buffer));
}

async fromStream(stream) {
const tokenizer = await strtok3.fromStream(stream);
try {
return await this.fromTokenizer(tokenizer);
} finally {
await tokenizer.close();
}
}

async toDetectionStream(readableStream, options = {}) {
const {default: stream} = await import('node:stream');
const {sampleSize = minimumBytes} = options;

return new Promise((resolve, reject) => {
readableStream.on('error', reject);

readableStream.once('readable', () => {
(async () => {
try {
// Set up output stream
const pass = new stream.PassThrough();
const outputStream = stream.pipeline ? stream.pipeline(readableStream, pass, () => {}) : readableStream.pipe(pass);

// Read the input stream and detect the filetype
const chunk = readableStream.read(sampleSize) ?? readableStream.read() ?? Buffer.alloc(0);
try {
pass.fileType = await this.fromBuffer(chunk);
} catch (error) {
if (error instanceof strtok3.EndOfStreamError) {
pass.fileType = undefined;
} else {
reject(error);
}
}

resolve(outputStream);
} catch (error) {
reject(error);
}
})();
});
});
}
}

class FileTypeParser {
check(header, options) {
return _check(this.buffer, header, options);
}
@@ -211,7 +277,7 @@ }
}

await tokenizer.ignore(id3HeaderLength);
return fileTypeFromTokenizer(tokenizer); // Skip ID3 header, recursion
return this.fromTokenizer(tokenizer); // Skip ID3 header, recursion
}

// Musepack, SV7
@@ -1609,39 +1675,8 @@ }
}
}

export async function fileTypeStream(readableStream, {sampleSize = minimumBytes} = {}) {
const {default: stream} = await import('node:stream');

return new Promise((resolve, reject) => {
readableStream.on('error', reject);

readableStream.once('readable', () => {
(async () => {
try {
// Set up output stream
const pass = new stream.PassThrough();
const outputStream = stream.pipeline ? stream.pipeline(readableStream, pass, () => {}) : readableStream.pipe(pass);

// Read the input stream and detect the filetype
const chunk = readableStream.read(sampleSize) ?? readableStream.read() ?? Buffer.alloc(0);
try {
const fileType = await fileTypeFromBuffer(chunk);
pass.fileType = fileType;
} catch (error) {
if (error instanceof strtok3.EndOfStreamError) {
pass.fileType = undefined;
} else {
reject(error);
}
}

resolve(outputStream);
} catch (error) {
reject(error);
}
})();
});
});
export async function fileTypeStream(readableStream, options = {}) {
return new FileTypeParser().toDetectionStream(readableStream, options);
}

export const supportedExtensions = new Set(extensions);
1 change: 1 addition & 0 deletions fixture/fixture.unicorn
@@ -0,0 +1 @@
UNICORN FILE CONTENT
7 changes: 4 additions & 3 deletions index.js
@@ -1,10 +1,11 @@
import * as strtok3 from 'strtok3';
import {fileTypeFromTokenizer} from './core.js';
import {FileTypeParser} from './core.js';

export async function fileTypeFromFile(path) {
export async function fileTypeFromFile(path, fileTypeOptions) {
const tokenizer = await strtok3.fromFile(path);
try {
return await fileTypeFromTokenizer(tokenizer);
const parser = new FileTypeParser(fileTypeOptions);
return await parser.fromTokenizer(tokenizer);
} finally {
await tokenizer.close();
}
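For illustration, a hedged sketch (not part of the diff) of the updated `fileTypeFromFile` signature, reusing the unicorn detector from the readme below together with the `fixture/fixture.unicorn` fixture added in this commit:

```js
import {fileTypeFromFile} from 'file-type';

// Hypothetical detector matching files that start with the ASCII bytes "UNICORN".
const unicornDetector = async tokenizer => {
	const unicornHeader = [85, 78, 73, 67, 79, 82, 78];
	const buffer = Buffer.alloc(unicornHeader.length);
	await tokenizer.peekBuffer(buffer, {length: unicornHeader.length, mayBeLess: true});
	return unicornHeader.every((value, index) => value === buffer[index])
		? {ext: 'unicorn', mime: 'application/unicorn'}
		: undefined;
};

const fileType = await fileTypeFromFile('fixture/fixture.unicorn', {customDetectors: [unicornDetector]});
console.log(fileType);
//=> {ext: 'unicorn', mime: 'application/unicorn'}
```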
60 changes: 60 additions & 0 deletions readme.md
@@ -189,6 +189,10 @@ console.log(await fileTypeFromBlob(blob));
//=> {ext: 'txt', mime: 'plain/text'}
```

#### blob

Type: [`Blob`](https://developer.mozilla.org/en-US/docs/Web/API/Blob)

### fileTypeFromTokenizer(tokenizer)

Detect the file type from an `ITokenizer` source.
@@ -305,6 +309,48 @@ Returns a `Set<string>` of supported file extensions.
Returns a `Set<string>` of supported MIME types.
## Custom detectors
A custom detector is a function that allows specifying additional detection mechanisms.

An iterable of detectors can be provided via the `fileTypeOptions` argument to the `FileTypeParser` constructor.

The detectors are called before the default detections, in the provided order.

Custom detectors can be used to add new `FileTypeResult` entries or to modify the return behaviour of existing `FileTypeResult` detections.

If a detector returns `undefined`, there are two possible scenarios:

1. The detector has not read from the tokenizer; detection proceeds with the next available detector.
2. The detector has read from the tokenizer (`tokenizer.position` has increased). In that case no further detectors are executed and the final conclusion is that file-type returns `undefined`. Note that this is an exceptional scenario, as the detector takes away the opportunity from any other detector to determine the file type.

Example detector array which can be extended and provided via the `fileTypeOptions` argument, for example to the `FileTypeParser` constructor:
```js
import {FileTypeParser} from 'file-type';

const customDetectors = [
	async tokenizer => {
		const unicornHeader = [85, 78, 73, 67, 79, 82, 78]; // 'UNICORN' in ASCII decimal

		const buffer = Buffer.alloc(7);
		await tokenizer.peekBuffer(buffer, {length: unicornHeader.length, mayBeLess: true});

		if (unicornHeader.every((value, index) => value === buffer[index])) {
			return {ext: 'unicorn', mime: 'application/unicorn'};
		}

		return undefined;
	},
];

const buffer = Buffer.from('UNICORN');

const parser = new FileTypeParser({customDetectors});

const fileType = await parser.fromBuffer(buffer);

console.log(fileType);
```
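Because custom detectors run before the built-in ones, they can also change how an already-supported signature is reported. A minimal sketch (not from this commit; `forcePlainZip` is a hypothetical detector) that short-circuits the default ZIP-based detection (docx, xlsx, and so on):

```js
import {FileTypeParser} from 'file-type';

const forcePlainZip = async tokenizer => {
	const zipHeader = [0x50, 0x4B, 0x03, 0x04]; // 'PK\x03\x04'

	const buffer = Buffer.alloc(zipHeader.length);
	await tokenizer.peekBuffer(buffer, {length: zipHeader.length, mayBeLess: true});

	return zipHeader.every((value, index) => value === buffer[index])
		? {ext: 'zip', mime: 'application/zip'}
		: undefined;
};

const parser = new FileTypeParser({customDetectors: [forcePlainZip]});

console.log(await parser.fromBuffer(Buffer.from([0x50, 0x4B, 0x03, 0x04])));
//=> {ext: 'zip', mime: 'application/zip'}
```

Since the detector only peeks at the header, returning `undefined` for non-ZIP input still lets the default detections run.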
## Supported file types
- [`3g2`](https://en.wikipedia.org/wiki/3GP_and_3G2#3G2) - Multimedia container format defined by the 3GPP2 for 3G CDMA2000 multimedia services
@@ -470,6 +516,20 @@ The following file types will not be accepted:
- `.csv` - [Reason.](https://github.com/sindresorhus/file-type/issues/264#issuecomment-568439196)
- `.svg` - Detecting it requires a full-blown parser. Check out [`is-svg`](https://github.com/sindresorhus/is-svg) for something that mostly works.
#### tokenizer

Type: [`ITokenizer`](https://github.com/Borewit/strtok3#tokenizer)

The source from which the content of the examined file is read.

#### fileType

Type: `FileTypeResult`

An object with an `ext` (extension) and `mime` (MIME type) property, detected by the standard detections or a previous custom detection. `undefined` if no matching file type could be found.
## Related
- [file-type-cli](https://github.com/sindresorhus/file-type-cli) - CLI for this module
