Skip to content

Commit 356bce8

Browse files
Separate confident and imprecise detectors, introduce detector identification (#717)
Co-authored-by: Sindre Sorhus <[email protected]>
1 parent 4db407d commit 356bce8

File tree

5 files changed

+81
-56
lines changed

5 files changed

+81
-56
lines changed

.github/pull_request_template.md

+4-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,10 @@ If you're adding support for a new file type, please follow the below steps:
44
- Add a fixture file named `fixture.<extension>` to the `fixture` directory.
55
- Add the file extension to the `extensions` array in `supported.js`.
66
- Add the file's MIME type to the `types` array in `supported.js`.
7-
- Add the file type detection logic to the `core.js` file
7+
- Add the file type detection logic to the `core.js` file.
8+
- Determine the appropriate detection confidence category:
9+
- `detectConfident()`: Detections with a high degree of certainty in identifying the correct file type.
10+
- `detectImprecise()`: Detections with limited supporting data, resulting in a higher likelihood of false positives.
811
- Respect the sequence:
912
- Signatures with a shorter sample size (counted from offset 0 until the last required byte position) are executed first.
1013
- Only the initial determination for the file type counts for the sequence.

core.d.ts

+4-1
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,10 @@ console.log(fileType); // {ext: 'unicorn', mime: 'application/unicorn'}
162162
@param fileType - The file type detected by standard or previous custom detectors, or `undefined` if no match is found.
163163
@returns The detected file type, or `undefined` if no match is found.
164164
*/
165-
export type Detector = (tokenizer: ITokenizer, fileType?: FileTypeResult) => Promise<FileTypeResult | undefined>;
165+
export type Detector = {
166+
id: string;
167+
detect: (tokenizer: ITokenizer, fileType?: FileTypeResult) => Promise<FileTypeResult | undefined>;
168+
};
166169

167170
export type FileTypeOptions = {
168171
customDetectors?: Iterable<Detector>;

core.js

+46-37
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,9 @@ export async function fileTypeStream(webStream, options) {
154154

155155
export class FileTypeParser {
156156
constructor(options) {
157-
this.detectors = [...(options?.customDetectors ?? []), this.parse];
157+
this.detectors = [...(options?.customDetectors ?? []),
158+
{id: 'core', detect: this.detectConfident},
159+
{id: 'core.imprecise', detect: this.detectImprecise}];
158160
this.tokenizerOptions = {
159161
abortSignal: options?.signal,
160162
};
@@ -165,7 +167,7 @@ export class FileTypeParser {
165167

166168
// Iterate through all file-type detectors
167169
for (const detector of this.detectors) {
168-
const fileType = await detector(tokenizer);
170+
const fileType = await detector.detect(tokenizer);
169171
if (fileType) {
170172
return fileType;
171173
}
@@ -256,7 +258,8 @@ export class FileTypeParser {
256258
return this.check(stringToBytes(header), options);
257259
}
258260

259-
parse = async tokenizer => {
261+
// Detections with a high degree of certainty in identifying the correct file type
262+
detectConfident = async tokenizer => {
260263
this.buffer = new Uint8Array(reasonableDetectionSizeInBytes);
261264

262265
// Keep reading until EOF if the file size is unknown.
@@ -346,7 +349,7 @@ export class FileTypeParser {
346349
if (this.check([0xEF, 0xBB, 0xBF])) { // UTF-8-BOM
347350
// Strip off UTF-8-BOM
348351
this.tokenizer.ignore(3);
349-
return this.parse(tokenizer);
352+
return this.detectConfident(tokenizer);
350353
}
351354

352355
if (this.check([0x47, 0x49, 0x46])) {
@@ -1406,39 +1409,6 @@ export class FileTypeParser {
14061409
return undefined; // Some unknown text based format
14071410
}
14081411

1409-
// -- Unsafe signatures --
1410-
1411-
if (
1412-
this.check([0x0, 0x0, 0x1, 0xBA])
1413-
|| this.check([0x0, 0x0, 0x1, 0xB3])
1414-
) {
1415-
return {
1416-
ext: 'mpg',
1417-
mime: 'video/mpeg',
1418-
};
1419-
}
1420-
1421-
if (this.check([0x00, 0x01, 0x00, 0x00, 0x00])) {
1422-
return {
1423-
ext: 'ttf',
1424-
mime: 'font/ttf',
1425-
};
1426-
}
1427-
1428-
if (this.check([0x00, 0x00, 0x01, 0x00])) {
1429-
return {
1430-
ext: 'ico',
1431-
mime: 'image/x-icon',
1432-
};
1433-
}
1434-
1435-
if (this.check([0x00, 0x00, 0x02, 0x00])) {
1436-
return {
1437-
ext: 'cur',
1438-
mime: 'image/x-icon',
1439-
};
1440-
}
1441-
14421412
if (this.check([0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1])) {
14431413
// Detected Microsoft Compound File Binary File (MS-CFB) Format.
14441414
return {
@@ -1644,6 +1614,45 @@ export class FileTypeParser {
16441614
mime: 'application/pgp-encrypted',
16451615
};
16461616
}
1617+
};
1618+
1619+
// Detections with limited supporting data, resulting in a higher likelihood of false positives
1620+
detectImprecise = async tokenizer => {
1621+
this.buffer = new Uint8Array(reasonableDetectionSizeInBytes);
1622+
1623+
// Read initial sample size of 8 bytes
1624+
await tokenizer.peekBuffer(this.buffer, {length: Math.min(8, tokenizer.fileInfo.size), mayBeLess: true});
1625+
1626+
if (
1627+
this.check([0x0, 0x0, 0x1, 0xBA])
1628+
|| this.check([0x0, 0x0, 0x1, 0xB3])
1629+
) {
1630+
return {
1631+
ext: 'mpg',
1632+
mime: 'video/mpeg',
1633+
};
1634+
}
1635+
1636+
if (this.check([0x00, 0x01, 0x00, 0x00, 0x00])) {
1637+
return {
1638+
ext: 'ttf',
1639+
mime: 'font/ttf',
1640+
};
1641+
}
1642+
1643+
if (this.check([0x00, 0x00, 0x01, 0x00])) {
1644+
return {
1645+
ext: 'ico',
1646+
mime: 'image/x-icon',
1647+
};
1648+
}
1649+
1650+
if (this.check([0x00, 0x00, 0x02, 0x00])) {
1651+
return {
1652+
ext: 'cur',
1653+
mime: 'image/x-icon',
1654+
};
1655+
}
16471656

16481657
// Check MPEG 1 or 2 Layer 3 header, or 'layer 0' for ADTS (MPEG sync-word 0xFFE)
16491658
if (this.buffer.length >= 2 && this.check([0xFF, 0xE0], {offset: 0, mask: [0xFF, 0xE0]})) {

readme.md

+6-5
Original file line numberDiff line numberDiff line change
@@ -364,8 +364,9 @@ Below is an example of a custom detector array. This can be passed to the `FileT
364364
```js
365365
import {FileTypeParser} from 'file-type';
366366

367-
const customDetectors = [
368-
async tokenizer => {
367+
const unicornDetector = {
368+
id: 'unicorn', // May be used to recognize the detector in the detector list
369+
async detect(tokenizer) {
369370
const unicornHeader = [85, 78, 73, 67, 79, 82, 78]; // "UNICORN" in ASCII decimal
370371

371372
const buffer = new Uint8Array(unicornHeader.length);
@@ -375,11 +376,11 @@ const customDetectors = [
375376
}
376377

377378
return undefined;
378-
},
379-
];
379+
}
380+
}
380381

381382
const buffer = new Uint8Array([85, 78, 73, 67, 79, 82, 78]);
382-
const parser = new FileTypeParser({customDetectors});
383+
const parser = new FileTypeParser({customDetectors: [unicornDetector]});
383384
const fileType = await parser.fromBuffer(buffer);
384385
console.log(fileType); // {ext: 'unicorn', mime: 'application/unicorn'}
385386
```

test.js

+21-12
Original file line numberDiff line numberDiff line change
@@ -688,22 +688,31 @@ test('corrupt MKV throws', async t => {
688688
});
689689

690690
// Create a custom detector for the just made up "unicorn" file type
691-
const unicornDetector = async tokenizer => {
692-
const unicornHeader = [85, 78, 73, 67, 79, 82, 78]; // "UNICORN" as decimal string
693-
const buffer = new Uint8Array(7);
694-
await tokenizer.peekBuffer(buffer, {length: unicornHeader.length, mayBeLess: true});
695-
if (unicornHeader.every((value, index) => value === buffer[index])) {
696-
return {ext: 'unicorn', mime: 'application/unicorn'};
697-
}
691+
const unicornDetector = {
692+
id: 'mock.unicorn',
693+
async detect(tokenizer) {
694+
const unicornHeader = [85, 78, 73, 67, 79, 82, 78]; // "UNICORN" as decimal string
695+
const buffer = new Uint8Array(7);
696+
await tokenizer.peekBuffer(buffer, {length: unicornHeader.length, mayBeLess: true});
697+
if (unicornHeader.every((value, index) => value === buffer[index])) {
698+
return {ext: 'unicorn', mime: 'application/unicorn'};
699+
}
698700

699-
return undefined;
701+
return undefined;
702+
},
700703
};
701704

702-
const mockPngDetector = _tokenizer => ({ext: 'mockPng', mime: 'image/mockPng'});
705+
const mockPngDetector = {
706+
id: 'mock.png',
707+
detect: () => ({ext: 'mockPng', mime: 'image/mockPng'}),
708+
};
703709

704-
const tokenizerPositionChanger = tokenizer => {
705-
const buffer = new Uint8Array(1);
706-
tokenizer.readBuffer(buffer, {length: 1, mayBeLess: true});
710+
const tokenizerPositionChanger = {
711+
id: 'mock.dirtyTokenizer',
712+
detect(tokenizer) {
713+
const buffer = new Uint8Array(1);
714+
tokenizer.readBuffer(buffer, {length: 1, mayBeLess: true});
715+
},
707716
};
708717

709718
if (nodeMajorVersion >= nodeVersionSupportingByteBlobStream) {

0 commit comments

Comments
 (0)