Skip to content

Commit

Permalink
feat: add capability to autodetect if content header is ambiguous + w…
Browse files Browse the repository at this point in the history
…rap up esm transition (#1)
  • Loading branch information
ncpenke authored Apr 29, 2024
1 parent 6e1f487 commit a53e371
Show file tree
Hide file tree
Showing 7 changed files with 402 additions and 244 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test_workflow.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ jobs:
- uses: actions/checkout@v2

# Install the project's dependencies before the test step runs.
# A bare `npm` invocation has no subcommand and installs nothing.
- name: Install dependencies
  run: npm install

- name: Run tests
uses: ospfranco/[email protected]
Expand Down
2 changes: 2 additions & 0 deletions __tests__/__snapshots__/index.spec.ts.snap
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,5 @@ exports[`#getLinkPreview() no link in text should fail gracefully 1`] = `"link-p
exports[`#getLinkPreview() should handle empty strings gracefully 1`] = `"link-preview-js did not receive a valid url or text"`;

exports[`#getLinkPreview() should handle malformed urls gracefully 1`] = `"link-preview-js did not receive a valid a url or text"`;

exports[`#getLinkPreview() should throw exception if URL is not valid 1`] = `"link-preview-js unexpected status in response 404 Not Found"`;
56 changes: 39 additions & 17 deletions __tests__/index.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import prefetchedResponse from "./sampleResponse.json";

describe(`#getLinkPreview()`, () => {
it(`should extract link info from just URL`, async () => {
const linkInfo: any = await getLinkPreview(
const linkInfo = await getLinkPreview(
`https://www.youtube.com/watch?v=wuClZjOdT30`,
{ headers: { "Accept-Language": `en-US` } },
);
Expand All @@ -13,13 +13,13 @@ describe(`#getLinkPreview()`, () => {
expect(linkInfo.title).toEqual(`Geography Now! Germany`);
expect(linkInfo.description).toBeTruthy();
expect(linkInfo.mediaType).toEqual(`video.other`);
expect(linkInfo.images.length).toEqual(1);
expect(linkInfo.images[0]).toEqual(
expect(linkInfo.images!.length).toEqual(1);
expect(linkInfo.images![0]).toEqual(
`https://i.ytimg.com/vi/wuClZjOdT30/maxresdefault.jpg`,
);
expect(linkInfo.videos.length).toEqual(0);
expect(linkInfo.favicons[0]).not.toBe(``);
expect(linkInfo.contentType.toLowerCase()).toEqual(`text/html`);
expect(linkInfo.videos!.length).toEqual(0);
expect(linkInfo.favicons![0]).not.toBe(``);
expect(linkInfo.contentType!.toLowerCase()).toEqual(`text/html`);
expect(linkInfo.charset?.toLowerCase()).toEqual(`utf-8`);
});

Expand Down Expand Up @@ -82,7 +82,7 @@ describe(`#getLinkPreview()`, () => {
);
expect(linkInfo.mediaType).toEqual(`audio`);
expect(linkInfo.contentType?.toLowerCase()).toEqual(`audio/mpeg`);
expect(linkInfo.favicons[0]).toBeTruthy();
expect(linkInfo.favicons![0]).toBeTruthy();
});

it(`should handle video urls`, async () => {
Expand All @@ -93,7 +93,7 @@ describe(`#getLinkPreview()`, () => {
expect(linkInfo.url).toEqual(`https://www.w3schools.com/html/mov_bbb.mp4`);
expect(linkInfo.mediaType).toEqual(`video`);
expect(linkInfo.contentType?.toLowerCase()).toEqual(`video/mp4`);
expect(linkInfo.favicons[0]).toBeTruthy();
expect(linkInfo.favicons![0]).toBeTruthy();
});

it(`should handle image urls`, async () => {
Expand All @@ -106,7 +106,7 @@ describe(`#getLinkPreview()`, () => {
);
expect(linkInfo.mediaType).toEqual(`image`);
expect(linkInfo.contentType?.toLowerCase()).toEqual(`image/jpeg`);
expect(linkInfo.favicons[0]).toBeTruthy();
expect(linkInfo.favicons![0]).toBeTruthy();
});

it(`should handle unknown content type urls`, async () => {
Expand All @@ -127,7 +127,7 @@ describe(`#getLinkPreview()`, () => {
);
expect(linkInfo.mediaType).toEqual(`application`);
expect(linkInfo.contentType?.toLowerCase()).toEqual(`application/pdf`);
expect(linkInfo.favicons[0]).toBeTruthy();
expect(linkInfo.favicons![0]).toBeTruthy();
});

it(`no link in text should fail gracefully`, async () => {
Expand Down Expand Up @@ -235,7 +235,7 @@ describe(`#getLinkPreview()`, () => {
});

it("should handle video tags without type or secure_url tags", async () => {
const res: any = await getLinkPreview(
const res = await getLinkPreview(
`https://newpathtitle.com/falling-markets-how-to-stop-buyer-from-getting-out/`,
{ followRedirects: `follow` },
);
Expand All @@ -246,19 +246,41 @@ describe(`#getLinkPreview()`, () => {
);
expect(res.description).toBeTruthy();
expect(res.mediaType).toEqual(`article`);
expect(res.images.length).toBeGreaterThan(0);
expect(res.videos.length).toBeGreaterThan(0);
expect(res.videos[0].url).toEqual(
expect(res.images!.length).toBeGreaterThan(0);
expect(res.videos!.length).toBeGreaterThan(0);
expect(res.videos![0].url).toEqual(
`https://www.youtube.com/embed/nqNXjxpAPkU`,
);
expect(res.favicons.length).toBeGreaterThan(0);
expect(res.contentType.toLowerCase()).toEqual(`text/html`);
expect(res.favicons!.length).toBeGreaterThan(0);
expect(res.contentType!.toLowerCase()).toEqual(`text/html`);
});

it("should auto detect mp4 even without a content type or file extension", async () => {
const res = await getLinkPreview(
"https://storage.googleapis.com/test-stubs/sample_mp4_without_extension",
);

expect(res.mediaType).toEqual(`video`);
expect(res.contentType).toEqual(`video/mp4`);
});

it("should throw exception if URL is not valid", async () => {
await expect(
getLinkPreview(
"https://storagenotvalid.googleapis.com/test-stubs/sample_mp4_without_extension",
),
).rejects.toThrowErrorMatchingSnapshot();
});
});

describe(`#getPreviewFromContent`, () => {
it(`Basic parsing`, async () => {
const linkInfo: any = await getPreviewFromContent(prefetchedResponse);
const linkInfo: any = await getPreviewFromContent({
...prefetchedResponse,
response: new Response(prefetchedResponse.data, {
headers: prefetchedResponse.headers,
}),
});

expect(linkInfo.url).toEqual(`https://www.youtube.com/watch?v=wuClZjOdT30`);
expect(linkInfo.siteName).toEqual(`YouTube`);
Expand Down
128 changes: 79 additions & 49 deletions index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import cheerio from "cheerio";
import { fetch } from "cross-fetch";
import AbortController from "abort-controller";
import { CONSTANTS } from "./constants";
import { fileTypeFromBuffer } from "file-type";

interface ILinkPreviewOptions {
headers?: Record<string, string>;
Expand All @@ -14,15 +15,32 @@ interface ILinkPreviewOptions {
}

/**
 * A resource the caller has already fetched and hands over for parsing
 * instead of letting the library perform the request itself.
 */
interface IPreFetchedResource {
  /** Header map supplied by the caller, keyed by header name. */
  headers?: Record<string, string>;
  /** NOTE(review): presumably the HTTP status code of the fetch — confirm. */
  status?: number;
  /** NOTE(review): not read in the visible code; presumably mirrors the option of the same name — confirm. */
  imagesPropertyType?: string;
  /** NOTE(review): not read in the visible code; presumably mirrors the option of the same name — confirm. */
  proxyUrl?: string;
  /** URL the resource was fetched from; forwarded to the parsers as the preview URL. */
  url: string;
  /** Body text, when the caller has already read it. */
  data?: string;
  /**
   * The fetch response. Required: parsing reads `ok`, `status`,
   * `statusText`, `headers` and the body from it without a null check.
   */
  response: Response;
}

/**
 * Shape of the preview object produced for a link.
 *
 * Only `url`, `mediaType` and `contentType` are always present as keys;
 * everything else is filled in when it could be extracted.
 */
export type LinkPreview = {
  // --- always present -------------------------------------------------
  /** The URL the preview describes. */
  url: string;
  /** Coarse media kind, e.g. `video`, `audio`, `image`, `article`. */
  mediaType: string;
  /** MIME type of the resource, e.g. `text/html`; may be undefined. */
  contentType: string | undefined;

  // --- textual metadata -----------------------------------------------
  title?: string;
  siteName?: string | undefined;
  description?: string | undefined;
  /** Character set of the resource, e.g. `utf-8`. */
  charset?: string;

  // --- media ----------------------------------------------------------
  /** Image URLs found for the link. */
  images?: string[];
  /** Video entries found for the link. */
  videos?: Array<{
    url: string | undefined;
    secureUrl: string | null | undefined;
    type: string | null | undefined;
    width: string | undefined;
    height: string | undefined;
  }>;
  /** Favicon locations for the link. */
  favicons?: URL[];
};

/**
*
* @param address
Expand Down Expand Up @@ -352,7 +370,7 @@ function parseTextResponse(
url: string,
options: ILinkPreviewOptions = {},
contentType?: string,
) {
): LinkPreview {
const doc = cheerio.load(body);

return {
Expand All @@ -368,38 +386,37 @@ function parseTextResponse(
};
}

// TODO: the file-type package could determine the mime type from magic numbers
/**
 * Builds a preview for a body whose content type is not known.
 *
 * Currently this simply hands every argument to `parseTextResponse`
 * and returns whatever that produces.
 *
 * @param body - raw body text of the fetched resource
 * @param url - address the body was fetched from
 * @param options - preview options; defaults to an empty object
 * @param contentType - content type to forward, if any
 */
function parseUnknownResponse(
  body: string,
  url: string,
  options: ILinkPreviewOptions = {},
  contentType?: string,
) {
  const preview = parseTextResponse(body, url, options, contentType);
  return preview;
}
/**
 * Reads the leading bytes of a response body so the payload type can be
 * detected from its magic numbers.
 *
 * When the body exposes a stream reader, at most SAMPLE_SIZE bytes are read
 * and the remainder of the body is cancelled so the transfer does not keep
 * running in the background. When no reader is available the whole body is
 * buffered through `arrayBuffer()`.
 *
 * Either way the body is consumed: `response` cannot be read again afterwards.
 *
 * @param response - fetch response whose body should be sampled
 * @returns the sampled bytes; empty when the body is empty
 */
const readBytesForFileType = async (
  response: Response,
): Promise<Uint8Array> => {
  // We get this from the file-type package as the sample size
  const SAMPLE_SIZE = 4100;

  // If the body doesn't expose a reader, take the array buffer directly
  // from the response.
  if (!response.body || !response.body.getReader) {
    return new Uint8Array(await response.arrayBuffer());
  }

  const reader = response.body.getReader();

  // Use the streaming API to collect the first SAMPLE_SIZE bytes of the body.
  const buffer = new Uint8Array(SAMPLE_SIZE);
  let offset = 0;

  try {
    while (offset < SAMPLE_SIZE) {
      const result = await reader.read();
      if (result.done) {
        break;
      }

      // Copy only as much of this chunk as still fits in the sample.
      const slice = result.value.subarray(0, SAMPLE_SIZE - offset);
      buffer.set(slice, offset);
      offset += slice.length;
    }
  } finally {
    // Stop the download and free the stream. A failure to cancel must not
    // mask an error raised while reading.
    await reader.cancel().catch(() => undefined);
  }

  return buffer.subarray(0, offset);
};

/**
 * Returns the body text of a pre-fetched resource.
 *
 * Prefers the already-read `data` string; otherwise reads the text of the
 * attached response.
 *
 * @param response - the pre-fetched resource
 * @throws Error when neither `data` nor `response` is available
 */
async function getData(response: IPreFetchedResource) {
  if (response.data) {
    return response.data;
  }

  if (response.response) {
    return await response.response.text();
  }

  throw new Error(`link-preview-js could not fetch link information`);
}

/**
*
Expand All @@ -409,23 +426,32 @@ async function getData(response: IPreFetchedResource) {
async function parseResponse(
response: IPreFetchedResource,
options?: ILinkPreviewOptions,
) {
): Promise<LinkPreview> {
if (!response.response.ok) {
throw new Error(
`link-preview-js unexpected status in response ${response.response.status} ${response.response.statusText}`,
);
}

try {
// console.log("[link-preview-js] response", response);
let contentType = response.response
? response.response.headers.get(`content-type`)
: response.headers
? response.headers[`content-type`]
: null;
let contentType = response.response.headers.get(`content-type`);
let contentTypeTokens: string[] = [];
let charset = null;

if (!contentType) {
return parseUnknownResponse(
await getData(response),
response.url,
options,
);
let charset;

// If the content type is sufficiently vague, then use the file type package to
// determine the content type via magic numbers.
if (
!contentType ||
["application/octet-stream", "video", "audio"].includes(contentType)
) {
const buffer = await readBytesForFileType(response.response);
const fileType = await fileTypeFromBuffer(buffer);
if (!fileType) {
const text = new TextDecoder().decode(buffer);
return parseTextResponse(text, response.url, options);
} else {
contentType = fileType.mime;
}
}

if (contentType.includes(`;`)) {
Expand Down Expand Up @@ -455,7 +481,7 @@ async function parseResponse(
if (CONSTANTS.REGEX_CONTENT_TYPE_TEXT.test(contentType)) {
return {
...parseTextResponse(
await getData(response),
await response.response.text(),
response.url,
options,
contentType,
Expand All @@ -472,7 +498,11 @@ async function parseResponse(
}

return {
...parseUnknownResponse(await getData(response), response.url, options),
...(await parseTextResponse(
await response.response.text(),
response.url,
options,
)),
charset,
};
} catch (e) {
Expand All @@ -494,7 +524,7 @@ async function parseResponse(
export async function getLinkPreview(
text: string,
options?: ILinkPreviewOptions,
) {
): Promise<LinkPreview> {
if (!text || typeof text !== `string`) {
throw new Error(`link-preview-js did not receive a valid url or text`);
}
Expand Down
12 changes: 8 additions & 4 deletions jest.config.ts
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
import type { Config } from "jest";

// Jest configuration: compile the TypeScript sources with ts-jest and run
// them as ES modules in a Node environment.
const config: Config = {
  preset: "ts-jest",
  testEnvironment: "node",
  // Files with these extensions are loaded as native ES modules.
  extensionsToTreatAsEsm: [".ts"],
  // Keep build output and installed packages out of module resolution.
  modulePathIgnorePatterns: ["<rootDir>/build/", "<rootDir>/node_modules/"],
  globals: {
    "ts-jest": {
      useESM: true,
    },
  },
};

export default config;
Loading

0 comments on commit a53e371

Please sign in to comment.