Separate safe from unsafe detectors, introduce detector identification

Borewit · Borewit · commit 793ef3385cb9 · 2025-01-02T17:52:19.000+01:00
Extend the `Detector` type with an `id` property, which allows users to understand which detector is doing what.
The core detectors are separated into:
Extend the `Detector` type with an `id` property, which allows users to understand which detector is doing what. The core detectors are separated into:
- `core.safe`: Safe core detectors
- `core.unsafe`: Collection of detectors with a higher probability of false positives
diff --git a/core.d.ts b/core.d.ts
@@ -162,7 +162,10 @@ console.log(fileType); // {ext: 'unicorn', mime: 'application/unicorn'}
 @param fileType - The file type detected by standard or previous custom detectors, or `undefined` if no match is found.
 @returns The detected file type, or `undefined` if no match is found.
 */
-export type Detector = (tokenizer: ITokenizer, fileType?: FileTypeResult) => Promise<FileTypeResult | undefined>;
+export type Detector = {
+	id: string;
+	detect: (tokenizer: ITokenizer, fileType?: FileTypeResult) => Promise<FileTypeResult | undefined>;
+};
 
 export type FileTypeOptions = {
 	customDetectors?: Iterable<Detector>;
diff --git a/core.js b/core.js
@@ -129,7 +129,9 @@ export async function fileTypeStream(webStream, options) {
 
 export class FileTypeParser {
 	constructor(options) {
-		this.detectors = [...(options?.customDetectors ?? []), this.parse];
+		this.detectors = [...(options?.customDetectors ?? []),
+			{id: 'core.safe', detect: this.detectCore},
+			{id: 'core.unsafe', detect: this.detectUnsafe}];
 		this.tokenizerOptions = {
 			abortSignal: options?.signal,
 		};
@@ -140,7 +142,7 @@ export class FileTypeParser {
 
 		// Iterate through all file-type detectors
 		for (const detector of this.detectors) {
-			const fileType = await detector(tokenizer);
+			const fileType = await detector.detect(tokenizer);
 			if (fileType) {
 				return fileType;
 			}
@@ -231,7 +233,7 @@ export class FileTypeParser {
 		return this.check(stringToBytes(header), options);
 	}
 
-	parse = async tokenizer => {
+	detectCore = async tokenizer => {
 		this.buffer = new Uint8Array(reasonableDetectionSizeInBytes);
 
 		// Keep reading until EOF if the file size is unknown.
@@ -321,7 +323,7 @@ export class FileTypeParser {
 		if (this.check([0xEF, 0xBB, 0xBF])) { // UTF-8-BOM
 			// Strip off UTF-8-BOM
 			this.tokenizer.ignore(3);
-			return this.parse(tokenizer);
+			return this.detectCore(tokenizer);
 		}
 
 		if (this.check([0x47, 0x49, 0x46])) {
@@ -1381,39 +1383,6 @@ export class FileTypeParser {
 			return undefined; // Some unknown text based format
 		}
 
-		// -- Unsafe signatures --
-
-		if (
-			this.check([0x0, 0x0, 0x1, 0xBA])
-			|| this.check([0x0, 0x0, 0x1, 0xB3])
-		) {
-			return {
-				ext: 'mpg',
-				mime: 'video/mpeg',
-			};
-		}
-
-		if (this.check([0x00, 0x01, 0x00, 0x00, 0x00])) {
-			return {
-				ext: 'ttf',
-				mime: 'font/ttf',
-			};
-		}
-
-		if (this.check([0x00, 0x00, 0x01, 0x00])) {
-			return {
-				ext: 'ico',
-				mime: 'image/x-icon',
-			};
-		}
-
-		if (this.check([0x00, 0x00, 0x02, 0x00])) {
-			return {
-				ext: 'cur',
-				mime: 'image/x-icon',
-			};
-		}
-
 		if (this.check([0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1])) {
 			// Detected Microsoft Compound File Binary File (MS-CFB) Format.
 			return {
@@ -1619,6 +1588,44 @@ export class FileTypeParser {
 				mime: 'application/pgp-encrypted',
 			};
 		}
+	};
+
+	detectUnsafe = async tokenizer => {
+		this.buffer = new Uint8Array(reasonableDetectionSizeInBytes);
+
+		// Read initial sample size of 8 bytes
+		await tokenizer.peekBuffer(this.buffer, {length: Math.min(8, tokenizer.fileInfo.size), mayBeLess: true});
+
+		if (
+			this.check([0x0, 0x0, 0x1, 0xBA])
+			|| this.check([0x0, 0x0, 0x1, 0xB3])
+		) {
+			return {
+				ext: 'mpg',
+				mime: 'video/mpeg',
+			};
+		}
+
+		if (this.check([0x00, 0x01, 0x00, 0x00, 0x00])) {
+			return {
+				ext: 'ttf',
+				mime: 'font/ttf',
+			};
+		}
+
+		if (this.check([0x00, 0x00, 0x01, 0x00])) {
+			return {
+				ext: 'ico',
+				mime: 'image/x-icon',
+			};
+		}
+
+		if (this.check([0x00, 0x00, 0x02, 0x00])) {
+			return {
+				ext: 'cur',
+				mime: 'image/x-icon',
+			};
+		}
 
 		// Check MPEG 1 or 2 Layer 3 header, or 'layer 0' for ADTS (MPEG sync-word 0xFFE)
 		if (this.buffer.length >= 2 && this.check([0xFF, 0xE0], {offset: 0, mask: [0xFF, 0xE0]})) {
diff --git a/readme.md b/readme.md
@@ -364,8 +364,9 @@ Below is an example of a custom detector array. This can be passed to the `FileT
 ```js
 import {FileTypeParser} from 'file-type';
 
-const customDetectors = [
-	async tokenizer => {
+const unicornDetector = {
+	id: 'unicorn',
+  	async detect(tokenizer) {
 		const unicornHeader = [85, 78, 73, 67, 79, 82, 78]; // "UNICORN" in ASCII decimal
 
 		const buffer = new Uint8Array(unicornHeader.length);
@@ -375,11 +376,11 @@ const customDetectors = [
 		}
 
 		return undefined;
-	},
-];
+	}
+}
 
 const buffer = new Uint8Array([85, 78, 73, 67, 79, 82, 78]);
-const parser = new FileTypeParser({customDetectors});
+const parser = new FileTypeParser({customDetectors: [unicornDetector]});
 const fileType = await parser.fromBuffer(buffer);
 console.log(fileType); // {ext: 'unicorn', mime: 'application/unicorn'}
 ```
diff --git a/test.js b/test.js
@@ -688,22 +688,31 @@ test('corrupt MKV throws', async t => {
 });
 
 // Create a custom detector for the just made up "unicorn" file type
-const unicornDetector = async tokenizer => {
-	const unicornHeader = [85, 78, 73, 67, 79, 82, 78]; // "UNICORN" as decimal string
-	const buffer = new Uint8Array(7);
-	await tokenizer.peekBuffer(buffer, {length: unicornHeader.length, mayBeLess: true});
-	if (unicornHeader.every((value, index) => value === buffer[index])) {
-		return {ext: 'unicorn', mime: 'application/unicorn'};
-	}
+const unicornDetector = {
+	id: 'mock.unicorn',
+	async detect(tokenizer) {
+		const unicornHeader = [85, 78, 73, 67, 79, 82, 78]; // "UNICORN" as decimal string
+		const buffer = new Uint8Array(7);
+		await tokenizer.peekBuffer(buffer, {length: unicornHeader.length, mayBeLess: true});
+		if (unicornHeader.every((value, index) => value === buffer[index])) {
+			return {ext: 'unicorn', mime: 'application/unicorn'};
+		}
 
-	return undefined;
+		return undefined;
+	},
 };
 
-const mockPngDetector = _tokenizer => ({ext: 'mockPng', mime: 'image/mockPng'});
+const mockPngDetector = {
+	id: 'mock.png',
+	detect: () => ({ext: 'mockPng', mime: 'image/mockPng'}),
+};
 
-const tokenizerPositionChanger = tokenizer => {
-	const buffer = new Uint8Array(1);
-	tokenizer.readBuffer(buffer, {length: 1, mayBeLess: true});
+const tokenizerPositionChanger = {
+	id: 'mock.dirtyTokenizer',
+	detect(tokenizer) {
+		const buffer = new Uint8Array(1);
+		tokenizer.readBuffer(buffer, {length: 1, mayBeLess: true});
+	},
 };
 
 if (nodeMajorVersion >= nodeVersionSupportingByteBlobStream) {