Merge branch 'main' into feat/DCP-1814-publish-package-to-gpr

Signed-off-by: Vinayak Kulkarni <[email protected]>
dscvr-one · Apr 29, 2024 · e0f210a · e0f210a
2 parents 7e9289c + a53e371
commit e0f210a
Show file tree

Hide file tree

Showing 2 changed files with 129 additions and 62 deletions.
diff --git a/__tests__/index.spec.ts b/__tests__/index.spec.ts
@@ -4,7 +4,7 @@ import prefetchedResponse from './sampleResponse.json' assert { type: 'json' };
 
 describe(`#getLinkPreview()`, () => {
   it(`should extract link info from just URL`, async () => {
-    const linkInfo: any = await getLinkPreview(
+    const linkInfo = await getLinkPreview(
       `https://www.youtube.com/watch?v=wuClZjOdT30`,
       { headers: { 'Accept-Language': `en-US` } },
     );
@@ -14,13 +14,13 @@ describe(`#getLinkPreview()`, () => {
     expect(linkInfo.title).toEqual(`Geography Now! Germany`);
     expect(linkInfo.description).toBeTruthy();
     expect(linkInfo.mediaType).toEqual(`video.other`);
-    expect(linkInfo.images.length).toEqual(1);
-    expect(linkInfo.images[0]).toEqual(
+    expect(linkInfo.images!.length).toEqual(1);
+    expect(linkInfo.images![0]).toEqual(
       `https://i.ytimg.com/vi/wuClZjOdT30/maxresdefault.jpg`,
     );
-    expect(linkInfo.videos.length).toEqual(0);
-    expect(linkInfo.favicons[0]).not.toBe(``);
-    expect(linkInfo.contentType.toLowerCase()).toEqual(`text/html`);
+    expect(linkInfo.videos!.length).toEqual(0);
+    expect(linkInfo.favicons![0]).not.toBe(``);
+    expect(linkInfo.contentType!.toLowerCase()).toEqual(`text/html`);
     expect(linkInfo.charset?.toLowerCase()).toEqual(`utf-8`);
   });
 
@@ -83,7 +83,7 @@ describe(`#getLinkPreview()`, () => {
     );
     expect(linkInfo.mediaType).toEqual(`audio`);
     expect(linkInfo.contentType?.toLowerCase()).toEqual(`audio/mpeg`);
-    expect(linkInfo.favicons[0]).toBeTruthy();
+    expect(linkInfo.favicons![0]).toBeTruthy();
   });
 
   it(`should handle video urls`, async () => {
@@ -94,7 +94,7 @@ describe(`#getLinkPreview()`, () => {
     expect(linkInfo.url).toEqual(`https://www.w3schools.com/html/mov_bbb.mp4`);
     expect(linkInfo.mediaType).toEqual(`video`);
     expect(linkInfo.contentType?.toLowerCase()).toEqual(`video/mp4`);
-    expect(linkInfo.favicons[0]).toBeTruthy();
+    expect(linkInfo.favicons![0]).toBeTruthy();
   });
 
   it(`should handle image urls`, async () => {
@@ -107,7 +107,7 @@ describe(`#getLinkPreview()`, () => {
     );
     expect(linkInfo.mediaType).toEqual(`image`);
     expect(linkInfo.contentType?.toLowerCase()).toEqual(`image/jpeg`);
-    expect(linkInfo.favicons[0]).toBeTruthy();
+    expect(linkInfo.favicons![0]).toBeTruthy();
   });
 
   it(`should handle unknown content type urls`, async () => {
@@ -128,7 +128,7 @@ describe(`#getLinkPreview()`, () => {
     );
     expect(linkInfo.mediaType).toEqual(`application`);
     expect(linkInfo.contentType?.toLowerCase()).toEqual(`application/pdf`);
-    expect(linkInfo.favicons[0]).toBeTruthy();
+    expect(linkInfo.favicons![0]).toBeTruthy();
   });
 
   it(`no link in text should fail gracefully`, async () => {
@@ -236,7 +236,7 @@ describe(`#getLinkPreview()`, () => {
   });
 
   it('should handle video tags without type or secure_url tags', async () => {
-    const res: any = await getLinkPreview(
+    const res = await getLinkPreview(
       `https://newpathtitle.com/falling-markets-how-to-stop-buyer-from-getting-out/`,
       { followRedirects: `follow` },
     );
@@ -247,19 +247,41 @@ describe(`#getLinkPreview()`, () => {
     );
     expect(res.description).toBeTruthy();
     expect(res.mediaType).toEqual(`article`);
-    expect(res.images.length).toBeGreaterThan(0);
-    expect(res.videos.length).toBeGreaterThan(0);
-    expect(res.videos[0].url).toEqual(
+    expect(res.images!.length).toBeGreaterThan(0);
+    expect(res.videos!.length).toBeGreaterThan(0);
+    expect(res.videos![0].url).toEqual(
       `https://www.youtube.com/embed/nqNXjxpAPkU`,
     );
-    expect(res.favicons.length).toBeGreaterThan(0);
-    expect(res.contentType.toLowerCase()).toEqual(`text/html`);
+    expect(res.favicons!.length).toBeGreaterThan(0);
+    expect(res.contentType!.toLowerCase()).toEqual(`text/html`);
+  });
+
+  it('should auto detect mp4 even without a content type or file extension', async () => {
+    const res = await getLinkPreview(
+      'https://storage.googleapis.com/test-stubs/sample_mp4_without_extension',
+    );
+
+    expect(res.mediaType).toEqual(`video`);
+    expect(res.contentType).toEqual(`video/mp4`);
+  });
+
+  it('should throw exception if URL is not valid', async () => {
+    await expect(
+      getLinkPreview(
+        'https://storagenotvalid.googleapis.com/test-stubs/sample_mp4_without_extension',
+      ),
+    ).rejects.toThrowErrorMatchingSnapshot();
   });
 });
 
 describe(`#getPreviewFromContent`, () => {
   it(`Basic parsing`, async () => {
-    const linkInfo: any = await getPreviewFromContent(prefetchedResponse);
+    const linkInfo: any = await getPreviewFromContent({
+      ...prefetchedResponse,
+      response: new Response(prefetchedResponse.data, {
+        headers: prefetchedResponse.headers,
+      }),
+    });
 
     expect(linkInfo.url).toEqual(`https://www.youtube.com/watch?v=wuClZjOdT30`);
     expect(linkInfo.siteName).toEqual(`YouTube`);

diff --git a/index.ts b/index.ts
@@ -2,25 +2,43 @@ import cheerio from 'cheerio';
 import { fetch } from 'cross-fetch';
 import AbortController from 'abort-controller';
 import { CONSTANTS } from './constants';
+import { fileTypeFromBuffer } from 'file-type';
 
-type ILinkPreviewOptions = {
+interface ILinkPreviewOptions {
   headers?: Record<string, string>;
   imagesPropertyType?: string;
   proxyUrl?: string;
   timeout?: number;
   followRedirects?: `follow` | `error` | `manual`;
   resolveDNSHost?: (url: string) => Promise<string>;
   handleRedirects?: (baseURL: string, forwardedURL: string) => boolean;
-};
+}
 
-type IPreFetchedResource = {
-  headers?: Record<string, string>;
+interface IPreFetchedResource {
   status?: number;
   imagesPropertyType?: string;
   proxyUrl?: string;
   url: string;
-  data?: string;
-  response?: Response;
+  response: Response;
+}
+
+export type LinkPreview = {
+  url: string;
+  title?: string;
+  siteName?: string | undefined;
+  description?: string | undefined;
+  mediaType: string;
+  contentType: string | undefined;
+  images?: string[];
+  videos?: {
+    url: string | undefined;
+    secureUrl: string | null | undefined;
+    type: string | null | undefined;
+    width: string | undefined;
+    height: string | undefined;
+  }[];
+  favicons?: URL[];
+  charset?: string;
 };
 
 const throwOnLoopback = (address: string) => {
@@ -63,7 +81,11 @@ const getDescription = (doc: cheerio.Root) => {
   return description;
 };
 
-const getMediaType = (doc: cheerio.Root) => {
+/**
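+ * Looks up the page's media type in its `medium` and `og:type` meta tags.
+ * @param doc - The loaded cheerio document to inspect.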
+ */
+function getMediaType(doc: cheerio.Root) {
   const node = metaTag(doc, `medium`, `name`);
   if (node) {
     const content = node.attr(`content`);
@@ -73,7 +95,7 @@ const getMediaType = (doc: cheerio.Root) => {
     metaTagContent(doc, `og:type`, `property`) ||
     metaTagContent(doc, `og:type`, `name`)
   );
-};
+}
 
 const getImages = (
   doc: cheerio.Root,
@@ -274,7 +296,7 @@ const parseTextResponse = (
   url: string,
   options: ILinkPreviewOptions = {},
   contentType?: string,
-) => {
+): LinkPreview => {
   const doc = cheerio.load(body);
 
   return {
@@ -290,48 +312,67 @@ const parseTextResponse = (
   };
 };
 
-// TODO: can use file-type package to determine mime type based on magic numbers
-const parseUnknownResponse = (
-  body: string,
-  url: string,
-  options: ILinkPreviewOptions = {},
-  contentType?: string,
-) => {
-  return parseTextResponse(body, url, options, contentType);
-};
+// Reads the leading bytes of the response (at most SAMPLE_SIZE when the body is streamable) for file type detection
+const readBytesForFileType = async (response: Response) => {
+  // The sample size is taken from the file-type package
+  const SAMPLE_SIZE = 4100;
 
-const getData = async (response: IPreFetchedResource) => {
-  if (response.data) {
-    return response.data;
+  // If the body doesn't have a reader, get the array buffer directly from the response
+  if (!response.body || !response.body.getReader) {
+    return await response.arrayBuffer();
   }
 
-  if (response.response) {
-    return await response.response.text();
+  const reader = response.body.getReader();
+
+  // Use the streaming API to accumulate the first SAMPLE_SIZE bytes
+  // from the response
+  const buffer = new Uint8Array(SAMPLE_SIZE);
+  let offset = 0;
+  let chunk;
+  while (!(chunk = await reader.read()).done) {
+    if (chunk.value.length + offset > SAMPLE_SIZE) {
+      const subChunk = chunk.value.subarray(0, SAMPLE_SIZE - offset);
+      buffer.set(subChunk, offset);
+      offset = SAMPLE_SIZE;
+      break;
+    } else {
+      buffer.set(chunk.value, offset);
+      offset += chunk.value.length;
+    }
   }
 
-  throw new Error(`link-preview-js could not fetch link information`);
+  return buffer.subarray(0, offset);
 };
 
 const parseResponse = async (
   response: IPreFetchedResource,
   options?: ILinkPreviewOptions,
-) => {
+): Promise<LinkPreview> => {
+  if (!response.response.ok) {
+    throw new Error(
+      `link-preview-js unexpected status in response ${response.response.status} ${response.response.statusText}`,
+    );
+  }
+
   try {
-    // console.log("[link-preview-js] response", response);
-    let contentType = response.response
-      ? response.response.headers.get(`content-type`)
-      : response.headers
-        ? response.headers[`content-type`]
-        : null;
+    let contentType = response.response.headers.get(`content-type`);
     let contentTypeTokens: string[] = [];
-    let charset = null;
-
-    if (!contentType) {
-      return parseUnknownResponse(
-        await getData(response),
-        response.url,
-        options,
-      );
+    let charset;
+
+    // If the content type is sufficiently vague, then use the file type package to
+    // determine the content type via magic numbers.
+    if (
+      !contentType ||
+      ['application/octet-stream', 'video', 'audio'].includes(contentType)
+    ) {
+      const buffer = await readBytesForFileType(response.response);
+      const fileType = await fileTypeFromBuffer(buffer);
+      if (!fileType) {
+        const text = new TextDecoder().decode(buffer);
+        return parseTextResponse(text, response.url, options);
+      } else {
+        contentType = fileType.mime;
+      }
     }
 
     if (contentType.includes(`;`)) {
@@ -361,7 +402,7 @@ const parseResponse = async (
     if (CONSTANTS.REGEX_CONTENT_TYPE_TEXT.test(contentType)) {
       return {
         ...parseTextResponse(
-          await getData(response),
+          await response.response.text(),
           response.url,
           options,
           contentType,
@@ -378,7 +419,11 @@ const parseResponse = async (
     }
 
     return {
-      ...parseUnknownResponse(await getData(response), response.url, options),
+      ...(await parseTextResponse(
+        await response.response.text(),
+        response.url,
+        options,
+      )),
       charset,
     };
   } catch (e) {
@@ -390,13 +435,13 @@ const parseResponse = async (
   }
 };
 
-// Parses the text, extracts the first link it finds and does a HTTP request
-// to fetch the website content, afterwards it tries to parse the internal HTML
-// and extract the information via meta tags
+//  Parses the text, extracts the first link it finds and does a HTTP request
+//  to fetch the website content, afterwards it tries to parse the internal HTML
+//  and extract the information via meta tags
 export const getLinkPreview = async (
   text: string,
   options?: ILinkPreviewOptions,
-) => {
+): Promise<LinkPreview> => {
   if (!text || typeof text !== `string`) {
     throw new Error(`link-preview-js did not receive a valid url or text`);
   }