Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Separate confident and imprecise detectors, introduce detector identification #717

Merged
merged 4 commits into from
Jan 7, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .github/pull_request_template.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@ If you're adding support for a new file type, please follow the below steps:
- Add a fixture file named `fixture.<extension>` to the `fixture` directory.
- Add the file extension to the `extensions` array in `supported.js`.
- Add the file's MIME type to the `types` array in `supported.js`.
- Add the file type detection logic to the `core.js` file
- Add the file type detection logic to the `core.js` file.
- Determine the appropriate detection confidence category:
- `detectConfident()`: Detections with a high degree of certainty in identifying the correct file type.
- `detectImprecise()`: Detections with limited supporting data, resulting in a higher likelihood of false positives.
- Respect the sequence:
- Signatures with a shorter sample size (counted from offset 0 to the last required byte position) are executed first.
- Only the initial check that determines the file type counts toward this ordering.
Expand Down
5 changes: 4 additions & 1 deletion core.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,10 @@ console.log(fileType); // {ext: 'unicorn', mime: 'application/unicorn'}
@param fileType - The file type detected by standard or previous custom detectors, or `undefined` if no match is found.
@returns The detected file type, or `undefined` if no match is found.
*/
export type Detector = (tokenizer: ITokenizer, fileType?: FileTypeResult) => Promise<FileTypeResult | undefined>;
export type Detector = {
/**
Identifier of the detector. May be used to recognize the detector in the detector list.
*/
id: string;
/**
Detection function, called with the tokenizer to inspect.

@param tokenizer - Tokenizer the detector reads (e.g. peeks) its sample bytes from.
@param fileType - The file type detected by standard or previous custom detectors, or `undefined` if no match is found.
@returns The detected file type, or `undefined` if no match is found.
*/
detect: (tokenizer: ITokenizer, fileType?: FileTypeResult) => Promise<FileTypeResult | undefined>;
};

export type FileTypeOptions = {
customDetectors?: Iterable<Detector>;
Expand Down
83 changes: 46 additions & 37 deletions core.js
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,9 @@ export async function fileTypeStream(webStream, options) {

export class FileTypeParser {
constructor(options) {
this.detectors = [...(options?.customDetectors ?? []), this.parse];
this.detectors = [...(options?.customDetectors ?? []),
{id: 'core', detect: this.detectConfident},
{id: 'core.imprecise', detect: this.detectImprecise}];
this.tokenizerOptions = {
abortSignal: options?.signal,
};
Expand All @@ -140,7 +142,7 @@ export class FileTypeParser {

// Iterate through all file-type detectors
for (const detector of this.detectors) {
const fileType = await detector(tokenizer);
const fileType = await detector.detect(tokenizer);
if (fileType) {
return fileType;
}
Expand Down Expand Up @@ -231,7 +233,8 @@ export class FileTypeParser {
return this.check(stringToBytes(header), options);
}

parse = async tokenizer => {
// Detections with a high degree of certainty in identifying the correct file type
detectConfident = async tokenizer => {
this.buffer = new Uint8Array(reasonableDetectionSizeInBytes);

// Keep reading until EOF if the file size is unknown.
Expand Down Expand Up @@ -321,7 +324,7 @@ export class FileTypeParser {
if (this.check([0xEF, 0xBB, 0xBF])) { // UTF-8-BOM
// Strip off UTF-8-BOM
this.tokenizer.ignore(3);
return this.parse(tokenizer);
return this.detectConfident(tokenizer);
}

if (this.check([0x47, 0x49, 0x46])) {
Expand Down Expand Up @@ -1381,39 +1384,6 @@ export class FileTypeParser {
return undefined; // Some unknown text based format
}

// -- Unsafe signatures --

if (
this.check([0x0, 0x0, 0x1, 0xBA])
|| this.check([0x0, 0x0, 0x1, 0xB3])
) {
return {
ext: 'mpg',
mime: 'video/mpeg',
};
}

if (this.check([0x00, 0x01, 0x00, 0x00, 0x00])) {
return {
ext: 'ttf',
mime: 'font/ttf',
};
}

if (this.check([0x00, 0x00, 0x01, 0x00])) {
return {
ext: 'ico',
mime: 'image/x-icon',
};
}

if (this.check([0x00, 0x00, 0x02, 0x00])) {
return {
ext: 'cur',
mime: 'image/x-icon',
};
}

if (this.check([0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1])) {
// Detected Microsoft Compound File Binary File (MS-CFB) Format.
return {
Expand Down Expand Up @@ -1619,6 +1589,45 @@ export class FileTypeParser {
mime: 'application/pgp-encrypted',
};
}
};

// Detections with limited supporting data, resulting in a higher likelihood of false positives
detectImprecise = async tokenizer => {
this.buffer = new Uint8Array(reasonableDetectionSizeInBytes);

// Read initial sample size of 8 bytes
await tokenizer.peekBuffer(this.buffer, {length: Math.min(8, tokenizer.fileInfo.size), mayBeLess: true});

if (
this.check([0x0, 0x0, 0x1, 0xBA])
|| this.check([0x0, 0x0, 0x1, 0xB3])
) {
return {
ext: 'mpg',
mime: 'video/mpeg',
};
}

if (this.check([0x00, 0x01, 0x00, 0x00, 0x00])) {
return {
ext: 'ttf',
mime: 'font/ttf',
};
}

if (this.check([0x00, 0x00, 0x01, 0x00])) {
return {
ext: 'ico',
mime: 'image/x-icon',
};
}

if (this.check([0x00, 0x00, 0x02, 0x00])) {
return {
ext: 'cur',
mime: 'image/x-icon',
};
}

// Check MPEG 1 or 2 Layer 3 header, or 'layer 0' for ADTS (MPEG sync-word 0xFFE)
if (this.buffer.length >= 2 && this.check([0xFF, 0xE0], {offset: 0, mask: [0xFF, 0xE0]})) {
Expand Down
11 changes: 6 additions & 5 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -364,8 +364,9 @@ Below is an example of a custom detector array. This can be passed to the `FileT
```js
import {FileTypeParser} from 'file-type';

const customDetectors = [
async tokenizer => {
const unicornDetector = {
id: 'unicorn', // May be used to recognize the detector in the detector list
async detect(tokenizer) {
const unicornHeader = [85, 78, 73, 67, 79, 82, 78]; // "UNICORN" in ASCII decimal

const buffer = new Uint8Array(unicornHeader.length);
Expand All @@ -375,11 +376,11 @@ const customDetectors = [
}

return undefined;
},
];
}
}

const buffer = new Uint8Array([85, 78, 73, 67, 79, 82, 78]);
const parser = new FileTypeParser({customDetectors});
const parser = new FileTypeParser({customDetectors: [unicornDetector]});
const fileType = await parser.fromBuffer(buffer);
console.log(fileType); // {ext: 'unicorn', mime: 'application/unicorn'}
```
Expand Down
33 changes: 21 additions & 12 deletions test.js
Original file line number Diff line number Diff line change
Expand Up @@ -688,22 +688,31 @@ test('corrupt MKV throws', async t => {
});

// Create a custom detector for the just made up "unicorn" file type
const unicornDetector = async tokenizer => {
const unicornHeader = [85, 78, 73, 67, 79, 82, 78]; // "UNICORN" as decimal string
const buffer = new Uint8Array(7);
await tokenizer.peekBuffer(buffer, {length: unicornHeader.length, mayBeLess: true});
if (unicornHeader.every((value, index) => value === buffer[index])) {
return {ext: 'unicorn', mime: 'application/unicorn'};
}
const unicornDetector = {
id: 'mock.unicorn',
async detect(tokenizer) {
const unicornHeader = [85, 78, 73, 67, 79, 82, 78]; // "UNICORN" as decimal string
const buffer = new Uint8Array(7);
await tokenizer.peekBuffer(buffer, {length: unicornHeader.length, mayBeLess: true});
if (unicornHeader.every((value, index) => value === buffer[index])) {
return {ext: 'unicorn', mime: 'application/unicorn'};
}

return undefined;
return undefined;
},
};

const mockPngDetector = _tokenizer => ({ext: 'mockPng', mime: 'image/mockPng'});
const mockPngDetector = {
	id: 'mock.png',
	// Mock detector: ignores its input and always reports the made-up "mockPng" type.
	// A fresh result object is built on every call.
	detect() {
		return {ext: 'mockPng', mime: 'image/mockPng'};
	},
};

const tokenizerPositionChanger = tokenizer => {
const buffer = new Uint8Array(1);
tokenizer.readBuffer(buffer, {length: 1, mayBeLess: true});
const tokenizerPositionChanger = {
id: 'mock.dirtyTokenizer',
detect(tokenizer) {
const buffer = new Uint8Array(1);
tokenizer.readBuffer(buffer, {length: 1, mayBeLess: true});
},
};

if (nodeMajorVersion >= nodeVersionSupportingByteBlobStream) {
Expand Down
Loading