feat: add capability to autodetect if content header is ambiguous + wrap up esm transition (#1)
dscvr-one · Apr 29, 2024 · a53e371 · a53e371
1 parent 6e1f487
commit a53e371
Show file tree

Hide file tree

Showing 7 changed files with 402 additions and 244 deletions.
diff --git a/.github/workflows/test_workflow.yml b/.github/workflows/test_workflow.yml
@@ -10,7 +10,7 @@ jobs:
     - uses: actions/checkout@v2
 
     - name: Install dependencies
-      run: yarn
+      run: npm
 
     - name: Run tests
       uses: ospfranco/[email protected]

diff --git a/__tests__/__snapshots__/index.spec.ts.snap b/__tests__/__snapshots__/index.spec.ts.snap
@@ -5,3 +5,5 @@ exports[`#getLinkPreview() no link in text should fail gracefully 1`] = `"link-p
 exports[`#getLinkPreview() should handle empty strings gracefully 1`] = `"link-preview-js did not receive a valid url or text"`;
 
 exports[`#getLinkPreview() should handle malformed urls gracefully 1`] = `"link-preview-js did not receive a valid a url or text"`;
+
+exports[`#getLinkPreview() should throw exception if URL is not valid 1`] = `"link-preview-js unexpected status in response 404 Not Found"`;
diff --git a/__tests__/index.spec.ts b/__tests__/index.spec.ts
@@ -3,7 +3,7 @@ import prefetchedResponse from "./sampleResponse.json";
 
 describe(`#getLinkPreview()`, () => {
   it(`should extract link info from just URL`, async () => {
-    const linkInfo: any = await getLinkPreview(
+    const linkInfo = await getLinkPreview(
       `https://www.youtube.com/watch?v=wuClZjOdT30`,
       { headers: { "Accept-Language": `en-US` } },
     );
@@ -13,13 +13,13 @@ describe(`#getLinkPreview()`, () => {
     expect(linkInfo.title).toEqual(`Geography Now! Germany`);
     expect(linkInfo.description).toBeTruthy();
     expect(linkInfo.mediaType).toEqual(`video.other`);
-    expect(linkInfo.images.length).toEqual(1);
-    expect(linkInfo.images[0]).toEqual(
+    expect(linkInfo.images!.length).toEqual(1);
+    expect(linkInfo.images![0]).toEqual(
       `https://i.ytimg.com/vi/wuClZjOdT30/maxresdefault.jpg`,
     );
-    expect(linkInfo.videos.length).toEqual(0);
-    expect(linkInfo.favicons[0]).not.toBe(``);
-    expect(linkInfo.contentType.toLowerCase()).toEqual(`text/html`);
+    expect(linkInfo.videos!.length).toEqual(0);
+    expect(linkInfo.favicons![0]).not.toBe(``);
+    expect(linkInfo.contentType!.toLowerCase()).toEqual(`text/html`);
     expect(linkInfo.charset?.toLowerCase()).toEqual(`utf-8`);
   });
 
@@ -82,7 +82,7 @@ describe(`#getLinkPreview()`, () => {
     );
     expect(linkInfo.mediaType).toEqual(`audio`);
     expect(linkInfo.contentType?.toLowerCase()).toEqual(`audio/mpeg`);
-    expect(linkInfo.favicons[0]).toBeTruthy();
+    expect(linkInfo.favicons![0]).toBeTruthy();
   });
 
   it(`should handle video urls`, async () => {
@@ -93,7 +93,7 @@ describe(`#getLinkPreview()`, () => {
     expect(linkInfo.url).toEqual(`https://www.w3schools.com/html/mov_bbb.mp4`);
     expect(linkInfo.mediaType).toEqual(`video`);
     expect(linkInfo.contentType?.toLowerCase()).toEqual(`video/mp4`);
-    expect(linkInfo.favicons[0]).toBeTruthy();
+    expect(linkInfo.favicons![0]).toBeTruthy();
   });
 
   it(`should handle image urls`, async () => {
@@ -106,7 +106,7 @@ describe(`#getLinkPreview()`, () => {
     );
     expect(linkInfo.mediaType).toEqual(`image`);
     expect(linkInfo.contentType?.toLowerCase()).toEqual(`image/jpeg`);
-    expect(linkInfo.favicons[0]).toBeTruthy();
+    expect(linkInfo.favicons![0]).toBeTruthy();
   });
 
   it(`should handle unknown content type urls`, async () => {
@@ -127,7 +127,7 @@ describe(`#getLinkPreview()`, () => {
     );
     expect(linkInfo.mediaType).toEqual(`application`);
     expect(linkInfo.contentType?.toLowerCase()).toEqual(`application/pdf`);
-    expect(linkInfo.favicons[0]).toBeTruthy();
+    expect(linkInfo.favicons![0]).toBeTruthy();
   });
 
   it(`no link in text should fail gracefully`, async () => {
@@ -235,7 +235,7 @@ describe(`#getLinkPreview()`, () => {
   });
 
   it("should handle video tags without type or secure_url tags", async () => {
-    const res: any = await getLinkPreview(
+    const res = await getLinkPreview(
       `https://newpathtitle.com/falling-markets-how-to-stop-buyer-from-getting-out/`,
       { followRedirects: `follow` },
     );
@@ -246,19 +246,41 @@ describe(`#getLinkPreview()`, () => {
     );
     expect(res.description).toBeTruthy();
     expect(res.mediaType).toEqual(`article`);
-    expect(res.images.length).toBeGreaterThan(0);
-    expect(res.videos.length).toBeGreaterThan(0);
-    expect(res.videos[0].url).toEqual(
+    expect(res.images!.length).toBeGreaterThan(0);
+    expect(res.videos!.length).toBeGreaterThan(0);
+    expect(res.videos![0].url).toEqual(
       `https://www.youtube.com/embed/nqNXjxpAPkU`,
     );
-    expect(res.favicons.length).toBeGreaterThan(0);
-    expect(res.contentType.toLowerCase()).toEqual(`text/html`);
+    expect(res.favicons!.length).toBeGreaterThan(0);
+    expect(res.contentType!.toLowerCase()).toEqual(`text/html`);
+  });
+
+  it("should auto detect mp4 even without a content type or file extension", async () => {
+    const res = await getLinkPreview(
+      "https://storage.googleapis.com/test-stubs/sample_mp4_without_extension",
+    );
+
+    expect(res.mediaType).toEqual(`video`);
+    expect(res.contentType).toEqual(`video/mp4`);
+  });
+
+  it("should throw exception if URL is not valid", async () => {
+    await expect(
+      getLinkPreview(
+        "https://storagenotvalid.googleapis.com/test-stubs/sample_mp4_without_extension",
+      ),
+    ).rejects.toThrowErrorMatchingSnapshot();
   });
 });
 
 describe(`#getPreviewFromContent`, () => {
   it(`Basic parsing`, async () => {
-    const linkInfo: any = await getPreviewFromContent(prefetchedResponse);
+    const linkInfo: any = await getPreviewFromContent({
+      ...prefetchedResponse,
+      response: new Response(prefetchedResponse.data, {
+        headers: prefetchedResponse.headers,
+      }),
+    });
 
     expect(linkInfo.url).toEqual(`https://www.youtube.com/watch?v=wuClZjOdT30`);
     expect(linkInfo.siteName).toEqual(`YouTube`);

diff --git a/index.ts b/index.ts
@@ -2,6 +2,7 @@ import cheerio from "cheerio";
 import { fetch } from "cross-fetch";
 import AbortController from "abort-controller";
 import { CONSTANTS } from "./constants";
+import { fileTypeFromBuffer } from "file-type";
 
 interface ILinkPreviewOptions {
   headers?: Record<string, string>;
@@ -14,15 +15,32 @@ interface ILinkPreviewOptions {
 }
 
 interface IPreFetchedResource {
-  headers?: Record<string, string>;
   status?: number;
   imagesPropertyType?: string;
   proxyUrl?: string;
   url: string;
-  data?: string;
-  response?: Response;
+  response: Response;
 }
 
+export type LinkPreview = {
+  url: string;
+  title?: string;
+  siteName?: string | undefined;
+  description?: string | undefined;
+  mediaType: string;
+  contentType: string | undefined;
+  images?: string[];
+  videos?: {
+    url: string | undefined;
+    secureUrl: string | null | undefined;
+    type: string | null | undefined;
+    width: string | undefined;
+    height: string | undefined;
+  }[];
+  favicons?: URL[];
+  charset?: string;
+};
+
 /**
  *
  * @param address
@@ -352,7 +370,7 @@ function parseTextResponse(
   url: string,
   options: ILinkPreviewOptions = {},
   contentType?: string,
-) {
+): LinkPreview {
   const doc = cheerio.load(body);
 
   return {
@@ -368,38 +386,37 @@ function parseTextResponse(
   };
 }
 
-// TODO: can use file-type package to determine mime type based on magic numbers
-/**
- *
- * @param body
- * @param url
- * @param options
- * @param contentType
- */
-function parseUnknownResponse(
-  body: string,
-  url: string,
-  options: ILinkPreviewOptions = {},
-  contentType?: string,
-) {
-  return parseTextResponse(body, url, options, contentType);
-}
+/// Read the leading bytes of the response for file-type detection: at most
+/// SAMPLE_SIZE bytes when the body can be streamed, otherwise the whole body
+const readBytesForFileType = async (response: Response) => {
+  // We get this from the file-type package as the sample size
+  const SAMPLE_SIZE = 4100;
 
-/**
- *
- * @param response
- */
-async function getData(response: IPreFetchedResource) {
-  if (response.data) {
-    return response.data;
+  // If the body doesn't expose a stream reader, get the array buffer directly from the response
+  if (!response.body || !response.body.getReader) {
+    return await response.arrayBuffer();
   }
 
-  if (response.response) {
-    return await response.response.text();
+  const reader = response.body.getReader();
+
+  // Use the streaming API to accumulate the first SAMPLE_SIZE bytes
+  // of the response
+  const buffer = new Uint8Array(SAMPLE_SIZE);
+  let offset = 0;
+  let chunk;
+  while (!(chunk = await reader.read()).done) {
+    if (chunk.value.length + offset > SAMPLE_SIZE) {
+      const subChunk = chunk.value.subarray(0, SAMPLE_SIZE - offset);
+      buffer.set(subChunk, offset);
+      offset = SAMPLE_SIZE;
+      break;
+    } else {
+      buffer.set(chunk.value, offset);
+      offset += chunk.value.length;
+    }
   }
 
-  throw new Error(`link-preview-js could not fetch link information`);
-}
+  return buffer.subarray(0, offset);
+};
 
 /**
  *
@@ -409,23 +426,32 @@ async function getData(response: IPreFetchedResource) {
 async function parseResponse(
   response: IPreFetchedResource,
   options?: ILinkPreviewOptions,
-) {
+): Promise<LinkPreview> {
+  if (!response.response.ok) {
+    throw new Error(
+      `link-preview-js unexpected status in response ${response.response.status} ${response.response.statusText}`,
+    );
+  }
+
   try {
-    // console.log("[link-preview-js] response", response);
-    let contentType = response.response
-      ? response.response.headers.get(`content-type`)
-      : response.headers
-        ? response.headers[`content-type`]
-        : null;
+    let contentType = response.response.headers.get(`content-type`);
     let contentTypeTokens: string[] = [];
-    let charset = null;
-
-    if (!contentType) {
-      return parseUnknownResponse(
-        await getData(response),
-        response.url,
-        options,
-      );
+    let charset;
+
+    // If the content type is sufficiently vague, then use the file type package to
+    // determine the content type via magic numbers.
+    if (
+      !contentType ||
+      ["application/octet-stream", "video", "audio"].includes(contentType)
+    ) {
+      const buffer = await readBytesForFileType(response.response);
+      const fileType = await fileTypeFromBuffer(buffer);
+      if (!fileType) {
+        const text = new TextDecoder().decode(buffer);
+        return parseTextResponse(text, response.url, options);
+      } else {
+        contentType = fileType.mime;
+      }
     }
 
     if (contentType.includes(`;`)) {
@@ -455,7 +481,7 @@ async function parseResponse(
     if (CONSTANTS.REGEX_CONTENT_TYPE_TEXT.test(contentType)) {
       return {
         ...parseTextResponse(
-          await getData(response),
+          await response.response.text(),
           response.url,
           options,
           contentType,
@@ -472,7 +498,11 @@ async function parseResponse(
     }
 
     return {
-      ...parseUnknownResponse(await getData(response), response.url, options),
+      ...(await parseTextResponse(
+        await response.response.text(),
+        response.url,
+        options,
+      )),
       charset,
     };
   } catch (e) {
@@ -494,7 +524,7 @@ async function parseResponse(
 export async function getLinkPreview(
   text: string,
   options?: ILinkPreviewOptions,
-) {
+): Promise<LinkPreview> {
   if (!text || typeof text !== `string`) {
     throw new Error(`link-preview-js did not receive a valid url or text`);
   }

diff --git a/jest.config.ts b/jest.config.ts
@@ -1,11 +1,15 @@
 import type { Config } from "jest";
 
 const config: Config = {
-  preset: `ts-jest`,
-  testEnvironment: `node`,
+  modulePathIgnorePatterns: ["<rootDir>/build/"],
   extensionsToTreatAsEsm: [".ts"],
-  globals: { "ts-jest": { diagnostics: false } },
-  modulePathIgnorePatterns: ["<rootDir>/build/", "<rootDir>/node_modules/"],
+  preset: "ts-jest",
+  testEnvironment: "node",
+  globals: {
+    "ts-jest": {
+      useESM: true,
+    },
+  },
 };
 
 export default config;
Original file line number	Diff line number	Diff line change
Expand Up		@@ -5,3 +5,5 @@ exports[`#getLinkPreview() no link in text should fail gracefully 1`] = `"link-p
		exports[`#getLinkPreview() should handle empty strings gracefully 1`] = `"link-preview-js did not receive a valid url or text"`;

		exports[`#getLinkPreview() should handle malformed urls gracefully 1`] = `"link-preview-js did not receive a valid a url or text"`;

		exports[`#getLinkPreview() should throw exception if URL is not valid 1`] = `"link-preview-js unexpected status in response 404 Not Found"`;